-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathdownsample.py
executable file
·142 lines (127 loc) · 4.54 KB
/
downsample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/env python
#=========================================================================
# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
# License (GPL) version 3, as described at www.opensource.org.
# Copyright (C)2023 William H. Majoros <[email protected]>
#=========================================================================
from __future__ import (absolute_import, division, print_function,
unicode_literals, generators, nested_scopes, with_statement)
from builtins import (bytes, dict, int, list, object, range, str, ascii,
chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
# The above imports should allow this program to run in both Python 2 and
# Python 3. You might need to update your version of module "future".
import sys
import gzip
import math
import random
from FastaReader import FastaReader
from FastaWriter import FastaWriter
import ProgramName
from Rex import Rex
rex=Rex()
RANGE=0.99
def getRange(pct,sortedThetas):
M=len(sortedThetas)
bothTails=(1-pct)*M
eachTail=int(bothTails/2)
lower=sortedThetas[eachTail]
upper=sortedThetas[M-1-eachTail]
return (lower,upper)
def computeDistribution(thetas,numBins):
sortedThetas=sorted(thetas)
M=len(sortedThetas)
(lower,upper)=getRange(RANGE,sortedThetas)
#print("range=",lower,upper)
thetaRange=upper-lower
binSize=thetaRange/numBins
distr=[]
pos=0
for i in range(1,numBins+1):
binEnd=lower+i*binSize
if(i==numBins): binEnd=sortedThetas[M-1]+1
(count,pos)=countInBin(pos,sortedThetas,binEnd)
p=float(count)/float(M)
distr.append([p,binEnd])
#print("distr=",distr)
return distr
def countInBin(pos,sortedThetas,binEnd):
M=len(sortedThetas)
firstPos=pos
while(pos<M and sortedThetas[pos]<binEnd): pos+=1
count=pos-firstPos
return (count,pos)
def loadThetas(thetasFile):
IN=open(thetasFile,"rt")
thetas=[]
for line in IN:
if(not rex.find("\d",line)): continue
theta=float(line)
thetas.append(math.log(theta))
IN.close()
return thetas
def computeAccept(distr,N,M,B):
alpha=[]
for i in range(B):
(pi,binEnd)=distr[i]
a=min(1,float(N)/(float(M)*float(B)*pi))
alpha.append(a)
return alpha
def nextCountsLine(line,DNA_REPS,RNA_REPS):
fields=line.rstrip().split()
fields=[int(x) for x in fields]
DNA=fields[:DNA_REPS]; RNA=fields[DNA_REPS:]
sumDNA=sum(DNA); sumRNA=sum(RNA)
naiveTheta=(sumRNA+RNA_REPS)/(sumDNA+DNA_REPS)
return math.log(naiveTheta)
def getBin(distr,theta):
B=len(distr)
for i in range(B):
(prop,end)=distr[i]
if(theta<end): return i
return B-1
#raise Exception("No bin found for theta=",theta)
def countCounts(countsFile):
IN_COUNTS=gzip.open(countsFile,"rt")
header=IN_COUNTS.readline()
M=0
for line in IN_COUNTS: M+=1
return M
#=========================================================================
# main()
#=========================================================================
if(len(sys.argv)!=8):
exit(ProgramName.get()+" <in:naive-thetas.txt> <in:fasta.gz> <in:counts.txt.gz> <#bins> <desired-sample-size> <out-dir> <train|test|validation>")
(thetasFile,fastaFile,countsFile,numBins,N,outDir,fileLabel)=sys.argv[1:]
numBins=int(numBins)
N=int(N)
outCountFile=outDir+"/"+fileLabel+"-counts.txt.gz"
outFastaFile=outDir+"/"+fileLabel+".fasta.gz"
# Compute empirical distribution of theta
thetas=loadThetas(thetasFile)
distr=computeDistribution(thetas,numBins)
# Compute acceptance probability for each bin
M=countCounts(countsFile)
alpha=computeAccept(distr,N,M,numBins)
# Create output files
OUT_COUNTS=gzip.open(outCountFile,"wt")
OUT_FASTA=gzip.open(outFastaFile,"wt")
# Downsample the data
IN_COUNTS=gzip.open(countsFile,"rt")
header=IN_COUNTS.readline()
if(not rex.find("DNA=(\d+)\s+RNA=(\d+)",header)):
raise Exception("Can't parse header in counts file: "+header)
print(header,end="",file=OUT_COUNTS)
DNA_REPS=int(rex[1]); RNA_REPS=int(rex[2])
fastaReader=FastaReader(fastaFile)
fastaWriter=FastaWriter()
for line in IN_COUNTS:
theta=nextCountsLine(line,DNA_REPS,RNA_REPS)
pair=fastaReader.nextSequence() # returns None at eof
if(pair is None): raise Exception("Reached eof of FASTA file prematurely")
(defline,sequence)=pair
b=getBin(distr,theta)
accept=alpha[b]
if(random.random()<=accept):
print(line,end="",file=OUT_COUNTS)
fastaWriter.addToFasta(defline,sequence,OUT_FASTA)
IN_COUNTS.close(); OUT_COUNTS.close(); OUT_FASTA.close()