-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathswift.C
287 lines (228 loc) · 7.17 KB
/
swift.C
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
/****************************************************************
swift.C
Copyright (C)2018 William H. Majoros ([email protected])
This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
License (GPL) version 3, as described at www.opensource.org.
****************************************************************/
#include <iostream>
#include "BOOM/String.H"
#include "BOOM/CommandLine.H"
#include "BOOM/File.H"
#include "BOOM/Regex.H"
#include "BOOM/Exceptions.H"
#include "BOOM/GSL/BetaDistribution.H"
#include "BOOM/GSL/Random.H"
#include "BOOM/VectorSorter.H"
#include "BOOM/File.H"
#include "Replicates.H"
#include "SwiftSample.H"
using namespace std;
using namespace BOOM;
const int PSEUDOCOUNT=1;
class Application {
Replicates DNA, RNA;
Vector<SwiftSample> samples;
void readReps(const Vector<String> &fields,int start,Replicates &);
void skipLines(int num,File &);
float getMedian();
void getCI(float percent,float &left,float &right);
void getP_reg(float lambda,float &leftP,float &rightP,float &P);
void addPseudocounts();
void save(File &) const;
public:
Application();
int main(int argc,char *argv[]);
bool loadInputs(File &,String &variantID);
void performSampling(int numSamples,float concentration);
void reportSummary(const String &id,float lambda);
};
int main(int argc,char *argv[])
{
try {
Application app;
return app.main(argc,argv);
}
catch(const char *p) { cerr << p << endl; }
catch(const string &msg) { cerr << msg.c_str() << endl; }
catch(const exception &e)
{cerr << "STL exception caught in main:\n" << e.what() << endl;}
catch(...) { cerr << "Unknown exception caught in main" << endl; }
return -1;
}
Application::Application()
{
GSL::Random::randomize();
}
int Application::main(int argc,char *argv[])
{
// Process command line
CommandLine cmd(argc,argv,"s:");
if(cmd.numArgs()!=5)
throw String("\n\
swift <input.txt> <concentration> <lambda> <first-last> <#samples>\n\
variant indices are 0-based and inclusive\n\
1.25 is recommended for lambda (min effect size)\n\
100 is recommended for the concentration\n\
1000 is recommended for #samples\n\
optional: -s mcmc.txt = save samples in mcmc.txt\n\
");
const String infile=cmd.arg(0);
const float concentration=cmd.arg(1).asFloat();
const float lambda=cmd.arg(2).asFloat();
const String variantRange=cmd.arg(3);
const int numSamples=cmd.arg(4).asInt();
if(lambda<1.0) throw "lambda must be >= 1";
String sampleFilename=cmd.optParm('s');
// Get ready to run on input file
Regex reg("(\\d+)-(\\d+)");
if(!reg.match(variantRange)) throw "can't parse variant index range";
const int firstVariant=reg[1].asInt();
const int lastVariant=reg[2].asInt();
File f(infile);
skipLines(firstVariant,f);
String id;
File *sampleFile=sampleFilename.empty() ? NULL :
new File(sampleFilename,"w");
for(int i=firstVariant ; i<=lastVariant ; ++i) {
DNA.clear(); RNA.clear(); samples.clear();
// Load inputs
if(!loadInputs(f,id)) break;
// Draw samples
performSampling(numSamples,concentration);
// Report median and 95% CI
reportSummary(id,lambda);
// Save samples if requested
if(sampleFile) save(*sampleFile);
}
delete sampleFile;
return 0;
}
void Application::skipLines(int num,File &file)
{
for(int i=0 ; i<num ; ++i) file.getline();
}
void Application::readReps(const Vector<String> &fields,int countField,
Replicates &reps)
{
const int numReps=fields[countField].asInt();
if(numReps<1) throw "Invalid number of replicates";
const int begin=countField+1;
const int end=begin+numReps*2;
for(int i=begin ; i<end ; i+=2) {
const int ref=fields[i].asInt(), alt=fields[i+1].asInt();
reps.add(Replicate(ref,alt));
}
}
bool Application::loadInputs(File &f,String &variantID)
{
if(f.eof()) throw "End of file";
String line=f.getline();
line.trimWhitespace();
if(line.isEmpty()) return false;
Vector<String> fields;
line.getFields(fields);
if(fields.size()<7) throw line+" : Not enough fields";
variantID=fields[0];
readReps(fields,1,DNA);
const int numDnaReps=fields[1].asInt();
readReps(fields,2+numDnaReps*2,RNA);
DNA.collapse();
RNA.collapse();
addPseudocounts();
return true;
}
void Application::performSampling(int numSamples,float conc)
{
if(DNA.size()!=1 || RNA.size()!=1) throw "Wrong number of replicates";
const int a=DNA[0].getAlt(), b=DNA[0].getRef();
const int k=RNA[0].getAlt(), m=RNA[0].getRef();
for(int i=0 ; i<numSamples ; ++i) {
// Sample p from P(p|a,b)
GSL::BetaDistribution beta1(a+1,b+1); // posterior with uniform prior
float p;
do { p=beta1.random(); } while(p==0.0 || p==1.0);
// Sample q from P(q|p,k,m), using beta prior parameterized by mean & conc
// ### TRYING THE ADAPTIVE CONCENTRATION METHOD:
//const float c=conc;
//const float c=5.5/p;
const float c=5.5/p;
// ###
const float alpha=p*(c-2); // prior
const float beta=(1-p)*(c-2); // prior
GSL::BetaDistribution beta2(alpha+k,beta+m); // posterior
float q;
do { q=beta2.random(); } while(q==0.0 || q==1.0);
samples.push_back(SwiftSample(p,q));
}
}
float Application::getMedian()
{
// PRECONDITION: samples have been sorted by theta
int n=samples.size();
if(n<2) throw "Too few samples to identify median";
int mid=n/2;
float median;
if(n%2==0)
median=(samples[mid-1].getTheta()+samples[mid].getTheta())/2.0;
else
median=samples[mid].getTheta();
}
void Application::addPseudocounts()
{
if(DNA[0].getAlt()==0 || DNA[0].getRef()==0 ||
RNA[0].getAlt()==0 || RNA[0].getRef()==0) {
DNA[0].addPseudocount(PSEUDOCOUNT);
RNA[0].addPseudocount(PSEUDOCOUNT);
}
}
void Application::getCI(float percent,float &left,float &right)
{
// PRECONDITION: samples have been sorted by theta
float halfAlpha=(1.0-percent)/2.0;
const int n=samples.size();
const int countIn=int(n*halfAlpha+5.0/9.0);
left=samples[countIn].getTheta();
right=samples[n-countIn].getTheta();
}
void Application::getP_reg(float lambda,float &leftP,float &rightP,float &P)
{
const int n=samples.size();
float invLambda=1.0/lambda;
int numLess=0;
int numGreater=0;
for(int i=0 ; i<n ; ++i) {
if(samples[i].getTheta()<invLambda) ++numLess;
if(samples[i].getTheta()>lambda) ++numGreater;
leftP=float(numLess)/float(n);
rightP=float(numGreater)/float(n);
P=leftP>rightP ? leftP : rightP;
}
}
void Application::reportSummary(const String &id,float lambda)
{
// Sort the samples
SwiftSampleComparator cmp;
VectorSorter<SwiftSample> sorter(samples,cmp);
sorter.sortAscendInPlace();
// Get the median
const float median=getMedian();
// Get the 95% CI
float left, right;
getCI(0.95,left,right);
// Get p-value-like statistics
float leftP, rightP, P_reg;
getP_reg(lambda,leftP,rightP,P_reg);
// Generate output
cout<<id<<"\t"<<median<<"\t"<<left<<"\t"<<right<<"\t"<<P_reg<<endl;
}
void Application::save(File &f) const
{
const int n=samples.size();
for(int i=0 ; i<n ; ++i) {
const SwiftSample &sample=samples[i];
float theta=int(sample.getTheta()*10000+5.0/9.0)/10000.0;
f.print(String(theta));
if(i+1<n) f.print("\t");
}
f.print("\n");
}