Skip to content

Commit 32eacdb

Browse files
committed
undo: remove create_reference_dist.py
1 parent cbe6209 commit 32eacdb

1 file changed

Lines changed: 155 additions & 0 deletions

File tree

guacamole/create_reference_dist.py

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
import os
2+
import sys
3+
from datetime import datetime
4+
from time import localtime, strftime
5+
import pandas as pd
6+
import numpy as np
7+
from math import floor
8+
from numpy.random import uniform
9+
import Bio.SeqIO as SeqIO
10+
from multiprocessing import Pool
11+
from functools import partial
12+
from glob import glob
13+
import argparse
14+
15+
16+
def print_time():
17+
now = datetime.now()
18+
current_time = now.strftime("%H:%M:%S")
19+
print("Current Time =", current_time)
20+
21+
22+
def parsing(sequence): # creates list with comulatative GC-content
23+
gc = [0]
24+
valid_nucleotides = [0]
25+
for base in sequence:
26+
if base in ['G', 'C']:
27+
gc.append(gc[-1] + 1)
28+
else:
29+
gc.append(gc[-1])
30+
if base in ['G', 'C', 'T', 'A']:
31+
valid_nucleotides.append(valid_nucleotides[-1] + 1)
32+
else:
33+
valid_nucleotides.append(valid_nucleotides[-1])
34+
35+
return gc, valid_nucleotides
36+
37+
38+
def create_refdist(seqid, kmer_len, paths, nbin=100, sampling=50):
39+
record_dict = SeqIO.index_db('fasta_index.sql', paths, 'fasta')
40+
seq = record_dict[seqid].seq
41+
occ = np.zeros(nbin + 1)
42+
gc_list, valid_nucleotides_list = parsing(seq)
43+
for i in range(len(seq) - kmer_len + 1)[::sampling]:
44+
gc = gc_list[i + kmer_len] - gc_list[i]
45+
valid_nucleotides = valid_nucleotides_list[i + kmer_len] - valid_nucleotides_list[i]
46+
if valid_nucleotides == 0:
47+
continue
48+
gc_content = floor((nbin / valid_nucleotides) * (gc + uniform()))
49+
if gc_content > nbin:
50+
continue
51+
occ[gc_content] += 1
52+
return occ
53+
54+
55+
def create_refdist_insert(seqid, kmer_len, paths, nbin=100, sampling=50, insert=0):
56+
record_dict = SeqIO.index_db('fasta_index.sql', paths, 'fasta')
57+
seq = record_dict[seqid].seq
58+
occ = np.zeros(nbin + 1)
59+
gc_list, valid_nucleotides_list = parsing(seq)
60+
for i in range(len(seq) - 2*kmer_len - insert + 1)[::sampling]:
61+
gc_1 = gc_list[i + kmer_len] - gc_list[i]
62+
gc_2 = gc_list[i + 2*kmer_len + insert] - gc_list[i + kmer_len + insert]
63+
valid_nucleotides_1 = valid_nucleotides_list[i + kmer_len] - valid_nucleotides_list[i]
64+
valid_nucleotides_2 = valid_nucleotides_list[i + 2*kmer_len + insert] - valid_nucleotides_list[i + kmer_len + insert]
65+
valid_nucleotides = valid_nucleotides_1 + valid_nucleotides_2
66+
if valid_nucleotides == 0:
67+
continue
68+
gc = gc_1 + gc_2
69+
gc_content = floor((nbin / valid_nucleotides) * (gc + uniform()))
70+
if gc_content > nbin:
71+
continue
72+
occ[gc_content] += 1
73+
return occ
74+
75+
76+
def main():
77+
78+
parser = argparse.ArgumentParser(
79+
description='Create reference distributions with given read length and possibly fragment length to run GuaCAMOLE')
80+
81+
parser.add_argument('--lib_path', metavar='lib_path', type=str, help='Path to Kraken2 database')
82+
parser.add_argument('--read_len', metavar='read_len', type=int, help='read length')
83+
parser.add_argument('--ncores', metavar='ncores', type=int, help='number of threads')
84+
parser.add_argument('--fragment_len', metavar='fragment_len', type=int, help='fragment length', default=None)
85+
args = parser.parse_args()
86+
lib_path = args.lib_path
87+
read_len = args.read_len
88+
ncores = args.ncores
89+
fragment_len = args.fragment_len
90+
old_gc_dist = None
91+
92+
if fragment_len is not None:
93+
insert = fragment_len - 2 * read_len
94+
95+
sampling = 50
96+
nbin = 100
97+
time = strftime("%m-%d-%Y %H:%M:%S", localtime())
98+
sys.stdout.write("PROGRAM START TIME: " + time + '\n')
99+
if old_gc_dist is not None:
100+
old_df = pd.read_csv(old_gc_dist, index_col=0)
101+
102+
os.chdir(lib_path)
103+
seqid2taxid = pd.read_csv('seqid2taxid.map', sep='\t', header=None)
104+
fasta_paths = glob('library/*/*.fna')
105+
# SeqIO.index_db seems to be sensitive to file order, so make sure it is reproducible
106+
fasta_paths.sort()
107+
print('Indexing fasta files...')
108+
record_dict = SeqIO.index_db('fasta_index.sql', fasta_paths, 'fasta')
109+
110+
print("Indexing complete, querying sequence IDs")
111+
seqids = [x for x in np.array(seqid2taxid[0]) if x in record_dict.keys()]
112+
113+
if old_gc_dist is not None:
114+
old_dist_seqids = np.array(old_df['seqids'])
115+
old_seqids = [x for x in seqids if x in old_dist_seqids]
116+
seqids = [x for x in seqids if x not in old_dist_seqids]
117+
print("Existing GC Distribution file provided!")
118+
print("Found GC distributions for " + str(len(old_seqids)) + " genome sequences in provided file")
119+
120+
print("Computing GC distributions for " + str(len(seqids)) + " genome sequences")
121+
if fragment_len is not None:
122+
with Pool(ncores) as pool:
123+
dist_seqids = pool.map(
124+
partial(create_refdist_insert, kmer_len=read_len, nbin=nbin, sampling=sampling, paths=fasta_paths,
125+
insert=insert),
126+
seqids
127+
)
128+
else:
129+
with Pool(ncores) as pool:
130+
dist_seqids = pool.map(
131+
partial(create_refdist, kmer_len=read_len, nbin=nbin, sampling=sampling, paths=fasta_paths),
132+
seqids
133+
)
134+
135+
time = strftime("%m-%d-%Y %H:%M:%S", localtime())
136+
sys.stdout.write("PROGRAM END TIME: " + time + '\n')
137+
138+
139+
in_seqs = seqid2taxid.iloc[:, 0].isin(seqids)
140+
txids = seqid2taxid.loc[in_seqs, 1]
141+
142+
df = pd.DataFrame(dist_seqids)
143+
df['taxid'] = np.array(txids)
144+
df['seqids'] = np.array(seqids)
145+
df.columns = [str(i) for i in df.columns]
146+
if old_gc_dist is not None:
147+
df = pd.concat([old_df.loc[old_df['seqids'].isin(old_seqids), :], df], axis=0)
148+
if fragment_len is not None:
149+
df.to_csv('gc_bin_' + str(nbin) + '_kmer_'+ str(read_len) + '_insert_' + str(insert) + '_dist.csv')
150+
else:
151+
df.to_csv('gc_bin_' + str(nbin) + '_kmer_'+ str(read_len) + '_dist.csv')
152+
153+
if __name__ == "__main__":
154+
main()
155+

0 commit comments

Comments
 (0)