-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathstoreStats.pl
executable file
·129 lines (112 loc) · 4.16 KB
/
storeStats.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env perl
# Computes & stores the codon weight matrix from an MPSA.
#
# Samuel S. Shepard ([email protected])
# 2022, Centers for Disease Control & Prevention
use strict;
use warnings;
use English qw( -no_match_vars);
use Storable qw(store_fd store);
use Getopt::Long;
my ( $outfile, $delim, $fieldSet );
GetOptions(
'output|O=s' => \$outfile,
'delim|D=s' => \$delim,
'field|F=s' => \$fieldSet
);
if ( -t STDIN && scalar(@ARGV) != 1 ) {
die( "Usage:\n\tperl $PROGRAM_NAME <nts.fasta> [options]\n"
. "\t\t--output|-O <file.sto>\tOutput file for storable object. Default: STDOUT\n"
. "\t\t--delim|-D <CHAR>\tDelimiter for header fields. Default delim is '|'.\n"
. "\t\t--field|-F <STR>\tComma-delimited set of fields to use for group. Default: no group.\n"
. "\n" );
}
my $numberSelected = 0;
my @fields = ();
if ( defined $fieldSet ) {
@fields = split( ',', $fieldSet );
$numberSelected = scalar(@fields);
foreach my $x (@fields) {
if ( $x == 0 ) {
die("$PROGRAM_NAME ERROR: field must be specified.\n");
} elsif ( $x < 0 ) {
die("$PROGRAM_NAME ERROR: field must be a positive number.\n");
}
}
foreach my $x ( 0 .. ( $numberSelected - 1 ) ) {
$fields[$x]--;
}
}
if ( !defined $delim ) {
$delim = '|';
} elsif ( $delim eq q{} ) {
die("$PROGRAM_NAME ERROR: No delimiter argument detected.\n");
} elsif ( length($delim) > 1 ) {
die("$PROGRAM_NAME ERROR: single character delimiter expected instead of '$delim'.\n");
}
my %counts = ();
local $RS = ">";
while ( my $fasta_record = <> ) {
chomp($fasta_record);
my @lines = split( /\r\n|\n|\r/smx, $fasta_record );
my $id = shift(@lines);
my $sequence = lc( join( q{}, @lines ) );
my $length = length($sequence);
if ( $length == 0 ) {
next;
} elsif ( $length % 3 != 0 ) {
die("$PROGRAM_NAME:\tNot a codon alignment, length $length not divisible by 3!\n");
} else {
if ( $numberSelected > 0 ) {
my @values = split( /\Q$delim\E/smx, $id );
my $numberFound = scalar(@values);
if ( $numberFound < $numberSelected ) {
die("$PROGRAM_NAME ERROR: non-existant field specified. Wanted $numberSelected but found $numberFound\n");
}
$id = join( $delim, ( @values[@fields] ) );
}
my $group = $numberSelected > 0 ? $id : '__NIL__';
$counts{$group}{$sequence}++;
}
}
my @groups = keys(%counts);
my %codonData = ();
if ( scalar @groups == 1 && $groups[0] eq '__NIL__' ) {
my $group = '__NIL__';
my @patterns = keys( %{ $counts{$group} } );
foreach my $pattern (@patterns) {
my $length = length($pattern);
my $count = $counts{$group}{$pattern};
## no critic (ControlStructures::ProhibitCStyleForLoops)
for ( my $pos = 0; $pos < $length; $pos += 3 ) {
# zero based codon number
my $codonNumber = int( $pos / 3 );
my $codonStart = $codonNumber * 3;
my $codon = substr( $pattern, $codonStart, 3 );
if ( $codon =~ /[.nN-]/smx ) { next; }
$codonData{$codonNumber}{$codon} += $count;
}
}
} else {
foreach my $group ( keys(%counts) ) {
my @patterns = keys( %{ $counts{$group} } );
foreach my $pattern (@patterns) {
my $length = length($pattern);
my $count = $counts{$group}{$pattern};
## no critic (ControlStructures::ProhibitCStyleForLoops)
for ( my $pos = 0; $pos < $length; $pos += 3 ) {
# zero based codon number
my $codonNumber = int( $pos / 3 );
my $codonStart = $codonNumber * 3;
my $codon = substr( $pattern, $codonStart, 3 );
if ( $codon =~ /[.nN-]/smx ) { next; }
$codonData{$group}{$codonNumber}{$codon} += $count;
}
}
}
}
if ( defined($outfile) ) {
store( \%codonData, $outfile ) or die("Cannot write to '$outfile'.\n");
} else {
store_fd( \%codonData, *STDOUT ) or die("Can't write to STDOUT.\n");
}