@@ -32,6 +32,52 @@ convert_omniscient_to_ensembl_style write_top_features prepare_gffout prepare_fi
3232
3333=cut
3434
35+ # +------------------------------------------------------+
36+ # |+----------------------------------------------------+|
37+ # || Batched Writer Helper Class ||
38+ # |+----------------------------------------------------+|
39+ # +------------------------------------------------------+
40+
# Helper class that buffers features and forwards them to the wrapped writer
# in batches, so the wrapped write_feature is called once per batch instead of
# once per feature.
#
# NOTE(review): features are buffered by reference, not copied. A feature that
# is modified after being handed to write_feature() but before the next
# flush() will be written in its modified state -- confirm that callers do not
# mutate features between the two.
package AGAT::BatchedGFFWriter;

# Constructor.
#   $gffout     - wrapped writer; must provide write_feature(@features)
#   $batch_size - number of buffered features that triggers a flush
#                 (defaults to 1000 when undefined)
sub new {
    my ($class, $gffout, $batch_size) = @_;
    $batch_size //= 1000;
    return bless {
        gffout     => $gffout,
        batch      => [],
        batch_size => $batch_size,
    }, $class;
}

# Queue one or more features; flush once the buffer reaches batch_size.
# Calling it with no feature is a no-op.
sub write_feature {
    my ($self, @features) = @_;
    return unless @features;

    push @{ $self->{batch} }, @features;

    $self->flush() if @{ $self->{batch} } >= $self->{batch_size};
    return;
}

# Hand every buffered feature to the wrapped writer in a single call.
# Dies with whatever the wrapped writer dies with.
sub flush {
    my ($self) = @_;
    my $pending = $self->{batch};
    return unless @{$pending};

    # Detach the buffer BEFORE writing: if the wrapped writer dies part-way,
    # a later flush (including the one issued by DESTROY) must not replay
    # features that may already have been written.
    $self->{batch} = [];
    $self->{gffout}->write_feature( @{$pending} );
    return;
}

# Safety net: write whatever is still buffered when the object is destroyed.
# Callers that need a defined output order should call flush() explicitly.
sub DESTROY {
    my ($self) = @_;

    # A destructor can run at any time (e.g. while an exception unwinds), so
    # keep it from disturbing the caller's global status variables.
    local ( $., $@, $!, $^E, $? );

    my $pending = $self->{batch};
    return unless $pending && @{$pending};

    # During global destruction the wrapped writer may already be gone.
    if ( !defined $self->{gffout} ) {
        warn 'AGAT::BatchedGFFWriter: ' . scalar( @{$pending} )
          . " buffered feature(s) lost: output writer no longer available\n";
        return;
    }

    # Never let an exception escape a destructor; report it instead.
    eval { $self->flush(); 1 }
      or warn "AGAT::BatchedGFFWriter: flush on destruction failed: $@";
    return;
}
78+
79+ package AGAT::OmniscientO ;
80+
3581# +------------------------------------------------------+
3682# |+----------------------------------------------------+|
3783# || Print Methods ||
@@ -68,6 +114,8 @@ sub prepare_gffout{
68114 $gffout = AGAT::BioperlGFF-> new(-fh => \*STDOUT , -type => $CONFIG -> {output_format }, -version => $version );
69115 }
70116
117+ dual_print1 " => Output format will be " .uc ($CONFIG -> {output_format }).$version ." .\n " ;
118+
71119 return $gffout ;
72120}
73121
@@ -150,6 +198,9 @@ sub print_omniscient_as_gff{
150198 if ( defined ($args -> {output })) {$gffout = $args -> {output };} else { print " Output parameter mandatory to use print_omniscient_as_gff!" ; exit ; }
151199 # -----------------------------------
152200
201+ # Wrap gffout with batched writer
202+ my $writer = AGAT::BatchedGFFWriter-> new($gffout , 1000);
203+
153204 # uri_decode_omniscient($omniscient);
154205
155206 # --------- deal with header --------------
@@ -195,7 +246,7 @@ sub print_omniscient_as_gff{
195246 if ( exists_keys( \%tabix_hash , ($startpos , ' level1' ) ) ){
196247 foreach my $level1_ID ( sort {$a cmp $b } keys %{$tabix_hash {$startpos }{' level1' }} ){
197248 foreach my $ptag_l1 ( sort {$a cmp $b } keys %{$tabix_hash {$startpos }{' level1' }{$level1_ID }} ){
198- $gffout -> write_feature($omniscient -> {' level1' }{$ptag_l1 }{$level1_ID }); # print feature
249+ $writer -> write_feature($omniscient -> {' level1' }{$ptag_l1 }{$level1_ID });
199250 }
200251 }
201252 }
@@ -205,7 +256,7 @@ sub print_omniscient_as_gff{
205256 foreach my $ptag_l2 ( sort {$a cmp $b } keys %{$tabix_hash {$startpos }{' level2' }{$level1_ID }{$level2_ID } } ){
206257 foreach my $feature_level2 ( @{$omniscient -> {' level2' }{$ptag_l2 }{$level1_ID }}) {
207258 if (lc ($feature_level2 -> _tag_value(' ID' )) eq $level2_ID ){
208- $gffout -> write_feature($feature_level2 ); # print feature
259+ $writer -> write_feature($feature_level2 );
209260 last ;
210261 }
211262 }
@@ -220,7 +271,7 @@ sub print_omniscient_as_gff{
220271 foreach my $feature_level3 ( @{$omniscient -> {' level3' }{$ptag_l3 }{$level2_ID } } ) {
221272 # check the start also because spreadfeatures like CDS can share same ID
222273 if (lc ($feature_level3 -> _tag_value(' ID' )) eq $level3_ID and $startpos == $feature_level3 -> start()){
223- $gffout -> write_feature($feature_level3 ); # print feature
274+ $writer -> write_feature($feature_level3 );
224275 last ;
225276 }
226277 }
@@ -229,6 +280,7 @@ sub print_omniscient_as_gff{
229280 }
230281 }
231282 }
283+ # No flush here: buffered features are written by the explicit flush() before write_fasta() below; DESTROY is only a safety net
232284 }
233285 else {
234286
@@ -248,22 +300,20 @@ sub print_omniscient_as_gff{
248300 # ################
249301 # == LEVEL 1 == # IF not in omniscient do that, otherwise we us within. Make a method for it.
250302 # ################
251- write_top_features($gffout , $seqid , $hash_sortBySeq_topf , $omniscient );
303+ write_top_features($writer , $seqid , $hash_sortBySeq_topf , $omniscient );
252304
253305 foreach my $locationid ( sort { ncmp ($a , $b ) } keys %{$hash_sortBySeq -> {$seqid } } ){
254306
255307 my $primary_tag_l1 = $hash_sortBySeq -> {$seqid }{$locationid }{' tag' };
256308 my $id_tag_key_level1 = $hash_sortBySeq -> {$seqid }{$locationid }{' id' };
257- $gffout -> write_feature($omniscient -> {' level1' }{$primary_tag_l1 }{$id_tag_key_level1 }); # print feature
258-
259- # ################
309+ $writer -> write_feature($omniscient -> {' level1' }{$primary_tag_l1 }{$id_tag_key_level1 });
260310 # == LEVEL 2 == #
261311 # ################
262312 foreach my $primary_tag_l2 (sort {$a cmp $b } keys %{$omniscient -> {' level2' }}){ # primary_tag_l2 = mrna or mirna or ncrna or trna etc...
263313
264314 if ( exists_keys( $omniscient , (' level2' , $primary_tag_l2 , $id_tag_key_level1 ) ) ){
265315 foreach my $feature_level2 ( sort { ($a -> start <=> $b -> start) || ($a -> end <=> $b -> end) || ncmp(lc ($a -> _tag_value(' ID' )), lc ($b -> _tag_value(' ID' ))) } @{$omniscient -> {' level2' }{$primary_tag_l2 }{$id_tag_key_level1 }}) {
266- $gffout -> write_feature($feature_level2 );
316+ $writer -> write_feature($feature_level2 );
267317
268318 # ################
269319 # == LEVEL 3 == #
@@ -278,14 +328,17 @@ sub print_omniscient_as_gff{
278328 else {
279329 warn " Cannot retrieve the parent feature of the following feature: " .gff_string($feature_level2 );
280330 }
281- print_level3_old_school( {omniscient => $omniscient , level2_ID => $level2_ID , output => $gffout } );
331+ print_level3_old_school( {omniscient => $omniscient , level2_ID => $level2_ID , output => $writer } );
282332 }
283333 }
284334 }
285335 }
286336 }
287337 }
288338
339+ # Ensure all batched features are written
340+ $writer -> flush();
341+
289342 # --------- deal with fasta seq --------------
290343 write_fasta($gffout );
291344}
@@ -303,6 +356,9 @@ sub print_omniscient_as_match{
303356 if ( defined ($args -> {output })) {$gffout = $args -> {output };} else { print " Output parameter mandatory to use print_omniscient_as_match!" ; exit ; }
304357 # -----------------------------------
305358
359+ # Wrap gffout with batched writer
360+ my $writer = AGAT::BatchedGFFWriter-> new($gffout , 1000);
361+
306362 # uri_decode_omniscient($omniscient);
307363
308364 # --------- deal with header --------------
@@ -318,15 +374,15 @@ sub print_omniscient_as_match{
318374 # == LEVEL 1 == #
319375 # ################
320376
321- write_top_features($gffout , $seqid , $hash_sortBySeq_topf , $omniscient );
377+ write_top_features($writer , $seqid , $hash_sortBySeq_topf , $omniscient );
322378
323379 foreach my $locationid ( sort { ncmp ($a , $b ) } keys %{$hash_sortBySeq -> {$seqid } } ){
324380
325381 my $primary_tag_l1 = $hash_sortBySeq -> {$seqid }{$locationid }{' tag' };
326382 my $id_tag_key_level1 = $hash_sortBySeq -> {$seqid }{$locationid }{' id' };
327383
328384 if ($primary_tag_l1 =~ " match" ){
329- $gffout -> write_feature($omniscient -> {' level1' }{$primary_tag_l1 }{$id_tag_key_level1 }); # print feature
385+ $writer -> write_feature($omniscient -> {' level1' }{$primary_tag_l1 }{$id_tag_key_level1 }); # print feature
330386 }
331387 # ################
332388 # == LEVEL 2 == #
@@ -337,15 +393,15 @@ sub print_omniscient_as_match{
337393 foreach my $feature_level2 ( sort { ($a -> start <=> $b -> start) || ($a -> end <=> $b -> end) || ncmp(lc ($a -> _tag_value(' ID' )), lc ($b -> _tag_value(' ID' ))) } @{$omniscient -> {' level2' }{$primary_tag_l2 }{$id_tag_key_level1 }}) {
338394
339395 if ($primary_tag_l2 =~ " match" ){
340- $gffout -> write_feature($feature_level2 );
396+ $writer -> write_feature($feature_level2 );
341397 }
342398 else {
343399 $feature_level2 -> primary_tag(' match' );
344400 if ( $feature_level2 -> has_tag(' Parent' )){
345401 $feature_level2 -> remove_tag(' Parent' );
346402 }
347403
348- $gffout -> write_feature($feature_level2 );
404+ $writer -> write_feature($feature_level2 );
349405
350406 # ################
351407 # == LEVEL 3 == #
@@ -368,7 +424,7 @@ sub print_omniscient_as_match{
368424 }
369425 $current_start =$end ;
370426
371- $gffout -> write_feature($feature_level3 );
427+ $writer -> write_feature($feature_level3 );
372428 }
373429 }
374430 }
@@ -395,6 +451,9 @@ sub print_omniscient_from_level1_id_list {
395451 if ( defined ($args -> {output })) {$gffout = $args -> {output };} else { print " Output parameter mandatory to use print_omniscient_from_level1_id_list!" ; exit ; }
396452 # -----------------------------------
397453
454+ # Wrap gffout with batched writer
455+ my $writer = AGAT::BatchedGFFWriter-> new($gffout , 1000);
456+
398457 # uri_decode_omniscient($omniscient);
399458
400459 # --------- deal with header --------------
@@ -409,7 +468,7 @@ sub print_omniscient_from_level1_id_list {
409468 # ################
410469 # == LEVEL 1 == #
411470 # ################
412- write_top_features($gffout , $seqid , $hash_sortBySeq_topf , $omniscient );
471+ write_top_features($writer , $seqid , $hash_sortBySeq_topf , $omniscient );
413472
414473 foreach my $locationid ( sort { ncmp ($a , $b ) } keys %{$hash_sortBySeq -> {$seqid } } ){
415474
@@ -418,7 +477,7 @@ sub print_omniscient_from_level1_id_list {
418477
419478 # _uri_encode_one_feature($omniscient->{'level1'}{$primary_tag_l1}{$id_tag_key_level1});
420479
421- $gffout -> write_feature($omniscient -> {' level1' }{$primary_tag_l1 }{$id_tag_key_level1 }); # print feature
480+ $writer -> write_feature($omniscient -> {' level1' }{$primary_tag_l1 }{$id_tag_key_level1 }); # print feature
422481
423482 # ################
424483 # == LEVEL 2 == #
@@ -430,7 +489,7 @@ sub print_omniscient_from_level1_id_list {
430489
431490 # _uri_encode_one_feature($feature_level2);
432491
433- $gffout -> write_feature($feature_level2 );
492+ $writer -> write_feature($feature_level2 );
434493
435494 # ################
436495 # == LEVEL 3 == #
@@ -446,7 +505,7 @@ sub print_omniscient_from_level1_id_list {
446505 warn " Cannot retrieve the parent feature of the following feature: " .gff_string($feature_level2 );
447506 }
448507
449- print_level3_old_school( {omniscient => $omniscient , level2_ID => $level2_ID , output => $gffout } );
508+ print_level3_old_school( {omniscient => $omniscient , level2_ID => $level2_ID , output => $writer } );
450509
451510 }
452511 }
@@ -466,10 +525,10 @@ sub print_level3_old_school{
466525 # Check we receive a hash as ref
467526 if (ref ($args ) ne ' HASH' ){ warn " Hash Arguments expected for print_level3_old_school. Please check the call.\n " ;exit ; }
468527 # Fill the parameters
469- my ($omniscient , $level2_ID , $gffout );
528+ my ($omniscient , $level2_ID , $writer );
470529 if ( defined ($args -> {omniscient })) {$omniscient = $args -> {omniscient };} else { print " Omniscient parameter mandatory to use print_level3_old_school!" ; exit ; }
471530 if ( defined ($args -> {level2_ID })) {$level2_ID = $args -> {level2_ID };} else { print " level2_ID parameter mandatory to use print_level3_old_school!" ; exit ; }
472- if ( defined ($args -> {output })) {$gffout = $args -> {output };} else { print " Output parameter mandatory to use print_level3_old_school!" ; exit ; }
531+ if ( defined ($args -> {output })) {$writer = $args -> {output };} else { print " Output parameter mandatory to use print_level3_old_school!" ; exit ; }
473532 # -----------------------------------
474533
475534 # -------------- Params --------------
@@ -480,7 +539,7 @@ sub print_level3_old_school{
480539 if ( exists_keys($omniscient ,(' level3' ,' tss' ,$level2_ID )) ){
481540 foreach my $feature_level3 ( @{$omniscient -> {' level3' }{' tss' }{$level2_ID }}) {
482541 # _uri_encode_one_feature($feature_level3);
483- $gffout -> write_feature($feature_level3 );
542+ $writer -> write_feature($feature_level3 );
484543 }
485544 }
486545
@@ -489,7 +548,7 @@ sub print_level3_old_school{
489548 if ( exists_keys( $omniscient , (' level3' , ' exon' , $level2_ID ) ) ){
490549 foreach my $feature_level3 ( sort {$a -> start <=> $b -> start} @{$omniscient -> {' level3' }{' exon' }{$level2_ID }}) {
491550 # _uri_encode_one_feature($feature_level3);
492- $gffout -> write_feature($feature_level3 );
551+ $writer -> write_feature($feature_level3 );
493552 }
494553 }
495554
@@ -498,7 +557,7 @@ sub print_level3_old_school{
498557 if ( exists_keys( $omniscient , (' level3' , ' cds' , $level2_ID ) ) ){
499558 foreach my $feature_level3 ( sort {$a -> start <=> $b -> start} @{$omniscient -> {' level3' }{' cds' }{$level2_ID }}) {
500559 # _uri_encode_one_feature($feature_level3);
501- $gffout -> write_feature($feature_level3 );
560+ $writer -> write_feature($feature_level3 );
502561 }
503562 }
504563
@@ -507,7 +566,7 @@ sub print_level3_old_school{
507566 if ( exists_keys($omniscient ,(' level3' ,' tts' ,$level2_ID )) ){
508567 foreach my $feature_level3 ( @{$omniscient -> {' level3' }{' tts' }{$level2_ID }}) {
509568 # _uri_encode_one_feature($feature_level3);
510- $gffout -> write_feature($feature_level3 );
569+ $writer -> write_feature($feature_level3 );
511570 }
512571 }
513572
@@ -518,7 +577,7 @@ sub print_level3_old_school{
518577 if ( exists_keys( $omniscient , (' level3' , $primary_tag_l3 , $level2_ID ) ) ){
519578 foreach my $feature_level3 ( sort {$a -> start <=> $b -> start} @{$omniscient -> {' level3' }{$primary_tag_l3 }{$level2_ID }}) {
520579 # _uri_encode_one_feature($feature_level3);
521- $gffout -> write_feature($feature_level3 );
580+ $writer -> write_feature($feature_level3 );
522581 }
523582 }
524583 }
@@ -613,9 +672,11 @@ sub print_ref_list_feature {
613672
614673 my ($list , $gffout ) = @_ ;
615674
675+ my $writer = AGAT::BatchedGFFWriter-> new($gffout , 1000);
616676 foreach my $feature (@$list ) {
617- $gffout -> write_feature($feature );
677+ $writer -> write_feature($feature );
618678 }
679+ $writer -> flush();
619680}
620681
621682# @Purpose: Print the headers when first time we access the fh
@@ -652,15 +713,15 @@ sub write_headers{
652713
# @Purpose: Write the level1 features recorded for one sequence id, visiting
#           the location id keys of $hash_sortBySeq_topf->{$seqid} in ncmp order.
#           Nothing is written when exists_keys finds no entry for $seqid.
# @input: 4 => $writer (any object providing write_feature; callers in this file
#              pass the batched writer), $seqid, $hash_sortBySeq_topf
#              (hash ref: seqid -> location id -> tag / id), $omniscient hash ref
# @output: none used; each feature is handed to $writer->write_feature
653714sub write_top_features{
654715
655- my ($gffout , $seqid , $hash_sortBySeq_topf , $omniscient ) = @_ ;
716+ my ($writer , $seqid , $hash_sortBySeq_topf , $omniscient ) = @_ ;
656717
657718 if ( exists_keys( $hash_sortBySeq_topf , ($seqid ) ) ){
658719
659720 foreach my $locationid ( sort { ncmp ($a , $b ) } keys %{$hash_sortBySeq_topf -> {$seqid }} ){
660721 my $tag_l1 = $hash_sortBySeq_topf -> {$seqid }{$locationid }{' tag' };
661722 my $id_l1 = $hash_sortBySeq_topf -> {$seqid }{$locationid }{' id' };
662723 my $feature_l1 = $omniscient -> {' level1' }{$tag_l1 }{$id_l1 };
663- $gffout -> write_feature($feature_l1 ); # print feature
724+ $writer -> write_feature($feature_l1 ); # print feature
664725 }
665726 }
666727}
0 commit comments