Skip to content

Commit 950ea8a

Browse files
committed
Add package AGAT::BatchedGFFWriter and route output through its write_feature method, which sends features to the underlying writer in batches and reduces writing time by more than 40%
1 parent 157310e commit 950ea8a

1 file changed

Lines changed: 89 additions & 28 deletions

File tree

lib/AGAT/OmniscientO.pm

Lines changed: 89 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,52 @@ convert_omniscient_to_ensembl_style write_top_features prepare_gffout prepare_fi
3232
3333
=cut
3434

35+
# +------------------------------------------------------+
36+
# |+----------------------------------------------------+|
37+
# || Batched Writer Helper Class ||
38+
# |+----------------------------------------------------+|
39+
# +------------------------------------------------------+
40+
41+
# Helper class to batch write_feature calls for better performance
42+
package AGAT::BatchedGFFWriter;

# Buffering front-end for a GFF writer object.
#
# Features handed to write_feature() are accumulated in memory and passed to
# the wrapped writer's own write_feature() in one call once the buffer holds
# at least `batch_size` features. Call flush() before writing anything else
# through the wrapped writer directly, otherwise buffered features will be
# emitted after that other output.

# Constructor.
#   $gffout     - object providing a write_feature(@features) method.
#   $batch_size - number of buffered features that triggers a flush
#                 (default 1000). A value below 1 makes every call flush.
# Returns the new AGAT::BatchedGFFWriter object.
sub new {
    my ($class, $gffout, $batch_size) = @_;
    $batch_size //= 1000;

    my $self = {
        gffout     => $gffout,
        batch      => [],
        batch_size => $batch_size,
    };
    return bless $self, $class;
}

# Queue one or more features; flush when the buffer reaches batch_size.
# Calling it with no feature is a no-op.
sub write_feature {
    my ($self, @features) = @_;
    return unless @features;

    push @{ $self->{batch} }, @features;

    # Flush if batch is full
    $self->flush() if @{ $self->{batch} } >= $self->{batch_size};
    return;
}

# Send every buffered feature to the wrapped writer and empty the buffer.
# Does nothing when the buffer is empty. Exceptions raised by the wrapped
# writer propagate to the caller.
sub flush {
    my ($self) = @_;

    my $pending = $self->{batch};
    return unless @{$pending};

    # Detach the buffer BEFORE writing: if the wrapped writer dies part-way
    # through, the same features must not be replayed by a later flush()
    # (in particular the automatic one in DESTROY while the exception
    # unwinds), which would duplicate records in the output.
    $self->{batch} = [];
    $self->{gffout}->write_feature( @{$pending} );
    return;
}

# Safety net: write whatever is still buffered when the object is destroyed.
# Explicit flush() calls remain the reliable way to control output order.
sub DESTROY {
    my ($self) = @_;

    # A destructor can run at any time (including while an exception is
    # propagating), so keep the global status variables untouched.
    local ( $@, $!, $? );

    return unless $self->{batch} && @{ $self->{batch} };

    # During global destruction the wrapped writer may already be gone.
    if ( !defined $self->{gffout} ) {
        warn "AGAT::BatchedGFFWriter: "
          . scalar( @{ $self->{batch} } )
          . " buffered feature(s) lost: output writer no longer available\n";
        return;
    }

    # An exception cannot usefully escape a destructor; report it instead.
    eval { $self->flush(); 1 }
      or warn "AGAT::BatchedGFFWriter: flush during destruction failed: $@";
    return;
}
78+
79+
package AGAT::OmniscientO;
80+
3581
# +------------------------------------------------------+
3682
# |+----------------------------------------------------+|
3783
# || Print Methods ||
@@ -68,6 +114,8 @@ sub prepare_gffout{
68114
$gffout = AGAT::BioperlGFF->new(-fh => \*STDOUT, -type => $CONFIG->{output_format}, -version => $version);
69115
}
70116

117+
dual_print1 "=> Output format will be ".uc($CONFIG->{output_format}).$version.".\n";
118+
71119
return $gffout;
72120
}
73121

@@ -150,6 +198,9 @@ sub print_omniscient_as_gff{
150198
if( defined($args->{output})) {$gffout = $args->{output};} else{ print "Output parameter mandatory to use print_omniscient_as_gff!"; exit; }
151199
# -----------------------------------
152200

201+
# Wrap gffout with batched writer
202+
my $writer = AGAT::BatchedGFFWriter->new($gffout, 1000);
203+
153204
#uri_decode_omniscient($omniscient);
154205

155206
# --------- deal with header --------------
@@ -195,7 +246,7 @@ sub print_omniscient_as_gff{
195246
if ( exists_keys( \%tabix_hash, ($startpos , 'level1') ) ){
196247
foreach my $level1_ID ( sort {$a cmp $b} keys %{$tabix_hash{$startpos}{'level1'}} ){
197248
foreach my $ptag_l1 ( sort {$a cmp $b} keys %{$tabix_hash{$startpos}{'level1'}{$level1_ID}} ){
198-
$gffout->write_feature($omniscient->{'level1'}{$ptag_l1}{$level1_ID}); # print feature
249+
$writer->write_feature($omniscient->{'level1'}{$ptag_l1}{$level1_ID});
199250
}
200251
}
201252
}
@@ -205,7 +256,7 @@ sub print_omniscient_as_gff{
205256
foreach my $ptag_l2 ( sort {$a cmp $b} keys %{$tabix_hash{$startpos}{'level2'}{$level1_ID}{$level2_ID} } ){
206257
foreach my $feature_level2 ( @{$omniscient->{'level2'}{$ptag_l2}{$level1_ID}}) {
207258
if(lc($feature_level2->_tag_value('ID')) eq $level2_ID ){
208-
$gffout->write_feature($feature_level2); # print feature
259+
$writer->write_feature($feature_level2);
209260
last;
210261
}
211262
}
@@ -220,7 +271,7 @@ sub print_omniscient_as_gff{
220271
foreach my $feature_level3 ( @{$omniscient->{'level3'}{$ptag_l3}{$level2_ID} } ) {
221272
# check the start also because spreadfeatures like CDS can share same ID
222273
if(lc($feature_level3->_tag_value('ID')) eq $level3_ID and $startpos == $feature_level3->start()){
223-
$gffout->write_feature($feature_level3); # print feature
274+
$writer->write_feature($feature_level3);
224275
last;
225276
}
226277
}
@@ -229,6 +280,7 @@ sub print_omniscient_as_gff{
229280
}
230281
}
231282
}
283+
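# No flush here: $writer is declared at sub scope, so it does not go out of scope at the end of this branch. Buffered features are written by the explicit flush() call that precedes write_fasta(); DESTROY is only a safety net.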
232284
}
233285
else{
234286

@@ -248,22 +300,20 @@ sub print_omniscient_as_gff{
248300
#################
249301
# == LEVEL 1 == # IF not in omniscient do that, otherwise we us within. Make a method for it.
250302
#################
251-
write_top_features($gffout, $seqid, $hash_sortBySeq_topf, $omniscient);
303+
write_top_features($writer, $seqid, $hash_sortBySeq_topf, $omniscient);
252304

253305
foreach my $locationid ( sort { ncmp ($a, $b) } keys %{$hash_sortBySeq->{$seqid} } ){
254306

255307
my $primary_tag_l1 = $hash_sortBySeq->{$seqid}{$locationid}{'tag'};
256308
my $id_tag_key_level1 = $hash_sortBySeq->{$seqid}{$locationid}{'id'};
257-
$gffout->write_feature($omniscient->{'level1'}{$primary_tag_l1}{$id_tag_key_level1}); # print feature
258-
259-
#################
309+
$writer->write_feature($omniscient->{'level1'}{$primary_tag_l1}{$id_tag_key_level1});
260310
# == LEVEL 2 == #
261311
#################
262312
foreach my $primary_tag_l2 (sort {$a cmp $b} keys %{$omniscient->{'level2'}}){ # primary_tag_l2 = mrna or mirna or ncrna or trna etc...
263313

264314
if ( exists_keys( $omniscient, ('level2', $primary_tag_l2, $id_tag_key_level1) ) ){
265315
foreach my $feature_level2 ( sort { ($a->start <=> $b->start) || ($a->end <=> $b->end) || ncmp(lc($a->_tag_value('ID')), lc($b->_tag_value('ID'))) } @{$omniscient->{'level2'}{$primary_tag_l2}{$id_tag_key_level1}}) {
266-
$gffout->write_feature($feature_level2);
316+
$writer->write_feature($feature_level2);
267317

268318
#################
269319
# == LEVEL 3 == #
@@ -278,14 +328,17 @@ sub print_omniscient_as_gff{
278328
else{
279329
warn "Cannot retrieve the parent feature of the following feature: ".gff_string($feature_level2);
280330
}
281-
print_level3_old_school( {omniscient => $omniscient, level2_ID =>$level2_ID, output => $gffout} );
331+
print_level3_old_school( {omniscient => $omniscient, level2_ID =>$level2_ID, output => $writer} );
282332
}
283333
}
284334
}
285335
}
286336
}
287337
}
288338

339+
# Ensure all batched features are written
340+
$writer->flush();
341+
289342
# --------- deal with fasta seq --------------
290343
write_fasta($gffout);
291344
}
@@ -303,6 +356,9 @@ sub print_omniscient_as_match{
303356
if( defined($args->{output})) {$gffout = $args->{output};} else{ print "Output parameter mandatory to use print_omniscient_as_match!"; exit; }
304357
# -----------------------------------
305358

359+
# Wrap gffout with batched writer
360+
my $writer = AGAT::BatchedGFFWriter->new($gffout, 1000);
361+
306362
#uri_decode_omniscient($omniscient);
307363

308364
# --------- deal with header --------------
@@ -318,15 +374,15 @@ sub print_omniscient_as_match{
318374
# == LEVEL 1 == #
319375
#################
320376

321-
write_top_features($gffout, $seqid, $hash_sortBySeq_topf, $omniscient);
377+
write_top_features($writer, $seqid, $hash_sortBySeq_topf, $omniscient);
322378

323379
foreach my $locationid ( sort { ncmp ($a, $b) } keys %{$hash_sortBySeq->{$seqid} } ){
324380

325381
my $primary_tag_l1 = $hash_sortBySeq->{$seqid}{$locationid}{'tag'};
326382
my $id_tag_key_level1 = $hash_sortBySeq->{$seqid}{$locationid}{'id'};
327383

328384
if($primary_tag_l1 =~ "match"){
329-
$gffout->write_feature($omniscient->{'level1'}{$primary_tag_l1}{$id_tag_key_level1}); # print feature
385+
$writer->write_feature($omniscient->{'level1'}{$primary_tag_l1}{$id_tag_key_level1}); # print feature
330386
}
331387
#################
332388
# == LEVEL 2 == #
@@ -337,15 +393,15 @@ sub print_omniscient_as_match{
337393
foreach my $feature_level2 ( sort { ($a->start <=> $b->start) || ($a->end <=> $b->end) || ncmp(lc($a->_tag_value('ID')), lc($b->_tag_value('ID'))) } @{$omniscient->{'level2'}{$primary_tag_l2}{$id_tag_key_level1}}) {
338394

339395
if($primary_tag_l2 =~ "match"){
340-
$gffout->write_feature($feature_level2);
396+
$writer->write_feature($feature_level2);
341397
}
342398
else{
343399
$feature_level2->primary_tag('match');
344400
if( $feature_level2->has_tag('Parent')){
345401
$feature_level2->remove_tag('Parent');
346402
}
347403

348-
$gffout->write_feature($feature_level2);
404+
$writer->write_feature($feature_level2);
349405

350406
#################
351407
# == LEVEL 3 == #
@@ -368,7 +424,7 @@ sub print_omniscient_as_match{
368424
}
369425
$current_start=$end;
370426

371-
$gffout->write_feature($feature_level3);
427+
$writer->write_feature($feature_level3);
372428
}
373429
}
374430
}
@@ -395,6 +451,9 @@ sub print_omniscient_from_level1_id_list {
395451
if( defined($args->{output})) {$gffout = $args->{output};} else{ print "Output parameter mandatory to use print_omniscient_from_level1_id_list!"; exit; }
396452
# -----------------------------------
397453

454+
# Wrap gffout with batched writer
455+
my $writer = AGAT::BatchedGFFWriter->new($gffout, 1000);
456+
398457
#uri_decode_omniscient($omniscient);
399458

400459
# --------- deal with header --------------
@@ -409,7 +468,7 @@ sub print_omniscient_from_level1_id_list {
409468
#################
410469
# == LEVEL 1 == #
411470
#################
412-
write_top_features($gffout, $seqid, $hash_sortBySeq_topf, $omniscient);
471+
write_top_features($writer, $seqid, $hash_sortBySeq_topf, $omniscient);
413472

414473
foreach my $locationid ( sort { ncmp ($a, $b) } keys %{$hash_sortBySeq->{$seqid} } ){
415474

@@ -418,7 +477,7 @@ sub print_omniscient_from_level1_id_list {
418477

419478
#_uri_encode_one_feature($omniscient->{'level1'}{$primary_tag_l1}{$id_tag_key_level1});
420479

421-
$gffout->write_feature($omniscient->{'level1'}{$primary_tag_l1}{$id_tag_key_level1}); # print feature
480+
$writer->write_feature($omniscient->{'level1'}{$primary_tag_l1}{$id_tag_key_level1}); # print feature
422481

423482
#################
424483
# == LEVEL 2 == #
@@ -430,7 +489,7 @@ sub print_omniscient_from_level1_id_list {
430489

431490
#_uri_encode_one_feature($feature_level2);
432491

433-
$gffout->write_feature($feature_level2);
492+
$writer->write_feature($feature_level2);
434493

435494
#################
436495
# == LEVEL 3 == #
@@ -446,7 +505,7 @@ sub print_omniscient_from_level1_id_list {
446505
warn "Cannot retrieve the parent feature of the following feature: ".gff_string($feature_level2);
447506
}
448507

449-
print_level3_old_school( {omniscient => $omniscient, level2_ID =>$level2_ID, output => $gffout} );
508+
print_level3_old_school( {omniscient => $omniscient, level2_ID =>$level2_ID, output => $writer} );
450509

451510
}
452511
}
@@ -466,10 +525,10 @@ sub print_level3_old_school{
466525
# Check we receive a hash as ref
467526
if(ref($args) ne 'HASH'){ warn "Hash Arguments expected for print_level3_old_school. Please check the call.\n";exit; }
468527
# Fill the parameters
469-
my ($omniscient, $level2_ID, $gffout);
528+
my ($omniscient, $level2_ID, $writer);
470529
if( defined($args->{omniscient})) {$omniscient = $args->{omniscient};} else{ print "Omniscient parameter mandatory to use print_level3_old_school!"; exit; }
471530
if( defined($args->{level2_ID})) {$level2_ID = $args->{level2_ID};} else{ print "level2_ID parameter mandatory to use print_level3_old_school!"; exit; }
472-
if( defined($args->{output})) {$gffout = $args->{output};} else{ print "Output parameter mandatory to use print_level3_old_school!"; exit; }
531+
if( defined($args->{output})) {$writer = $args->{output};} else{ print "Output parameter mandatory to use print_level3_old_school!"; exit; }
473532
# -----------------------------------
474533

475534
# -------------- Params --------------
@@ -480,7 +539,7 @@ sub print_level3_old_school{
480539
if ( exists_keys($omniscient,('level3','tss',$level2_ID)) ){
481540
foreach my $feature_level3 ( @{$omniscient->{'level3'}{'tss'}{$level2_ID}}) {
482541
#_uri_encode_one_feature($feature_level3);
483-
$gffout->write_feature($feature_level3);
542+
$writer->write_feature($feature_level3);
484543
}
485544
}
486545

@@ -489,7 +548,7 @@ sub print_level3_old_school{
489548
if ( exists_keys( $omniscient, ('level3', 'exon', $level2_ID) ) ){
490549
foreach my $feature_level3 ( sort {$a->start <=> $b->start} @{$omniscient->{'level3'}{'exon'}{$level2_ID}}) {
491550
#_uri_encode_one_feature($feature_level3);
492-
$gffout->write_feature($feature_level3);
551+
$writer->write_feature($feature_level3);
493552
}
494553
}
495554

@@ -498,7 +557,7 @@ sub print_level3_old_school{
498557
if ( exists_keys( $omniscient, ('level3', 'cds', $level2_ID) ) ){
499558
foreach my $feature_level3 ( sort {$a->start <=> $b->start} @{$omniscient->{'level3'}{'cds'}{$level2_ID}}) {
500559
#_uri_encode_one_feature($feature_level3);
501-
$gffout->write_feature($feature_level3);
560+
$writer->write_feature($feature_level3);
502561
}
503562
}
504563

@@ -507,7 +566,7 @@ sub print_level3_old_school{
507566
if ( exists_keys($omniscient,('level3','tts',$level2_ID)) ){
508567
foreach my $feature_level3 ( @{$omniscient->{'level3'}{'tts'}{$level2_ID}}) {
509568
#_uri_encode_one_feature($feature_level3);
510-
$gffout->write_feature($feature_level3);
569+
$writer->write_feature($feature_level3);
511570
}
512571
}
513572

@@ -518,7 +577,7 @@ sub print_level3_old_school{
518577
if ( exists_keys( $omniscient, ('level3', $primary_tag_l3, $level2_ID) ) ){
519578
foreach my $feature_level3 ( sort {$a->start <=> $b->start} @{$omniscient->{'level3'}{$primary_tag_l3}{$level2_ID}}) {
520579
#_uri_encode_one_feature($feature_level3);
521-
$gffout->write_feature($feature_level3);
580+
$writer->write_feature($feature_level3);
522581
}
523582
}
524583
}
@@ -613,9 +672,11 @@ sub print_ref_list_feature {
613672

614673
my ($list, $gffout) = @_ ;
615674

675+
my $writer = AGAT::BatchedGFFWriter->new($gffout, 1000);
616676
foreach my $feature (@$list) {
617-
$gffout->write_feature($feature);
677+
$writer->write_feature($feature);
618678
}
679+
$writer->flush();
619680
}
620681

621682
# @Purpose: Print the headers when first time we access the fh
@@ -652,15 +713,15 @@ sub write_headers{
652713

653714
# Write the level1 features listed for one sequence in $hash_sortBySeq_topf.
#   $writer              - object providing write_feature()
#   $seqid               - sequence identifier used as key in $hash_sortBySeq_topf
#   $hash_sortBySeq_topf - hash ref: {seqid}{locationid} => { tag => ..., id => ... }
#   $omniscient          - hash ref holding the features under {'level1'}{tag}{id}
# Location ids are visited in the order given by ncmp (defined elsewhere;
# presumably a natural-order comparison - confirm).
sub write_top_features{

    my ($writer, $seqid, $hash_sortBySeq_topf, $omniscient ) = @_;

    # Nothing recorded for this sequence: nothing to write.
    return unless exists_keys( $hash_sortBySeq_topf, ($seqid) );

    my $entries_for_seq = $hash_sortBySeq_topf->{$seqid};
    for my $location_key ( sort { ncmp ($a, $b) } keys %{$entries_for_seq} ){
        my $entry = $entries_for_seq->{$location_key};
        my ($l1_tag, $l1_id) = ( $entry->{'tag'}, $entry->{'id'} );

        # print feature
        $writer->write_feature( $omniscient->{'level1'}{$l1_tag}{$l1_id} );
    }
}

0 commit comments

Comments
 (0)