Skip to content

Commit 080890b

Browse files
author
iainrb
committed
Classes to parse EGT and create FCR files
1 parent f4768bd commit 080890b

18 files changed

+1156
-168
lines changed

Egt.cpp

+274
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,274 @@
1+
2+
// Egt.cpp
3+
//
4+
// Author: Iain Bancarz <[email protected]>
5+
//
6+
// Copyright (c) 2014 Genome Research Ltd.
7+
//
8+
// Redistribution and use in source and binary forms, with or without
9+
// modification, are permitted provided that the following conditions are met:
10+
// 1. Redistributions of source code must retain the above copyright notice,
11+
// this list of conditions and the following disclaimer.
12+
// 2. Redistributions in binary form must reproduce the above copyright
13+
// notice, this list of conditions and the following disclaimer in the
14+
// documentation and/or other materials provided with the distribution.
15+
// 3. Neither the name of Genome Research Ltd nor the names of the
16+
// contributors may be used to endorse or promote products derived from
17+
// software without specific prior written permission.
18+
//
19+
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20+
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21+
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22+
// IN NO EVENT SHALL GENOME RESEARCH LTD. BE LIABLE FOR ANY DIRECT, INDIRECT,
23+
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24+
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
25+
// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26+
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27+
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28+
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29+
//
30+
31+
/*
32+
* Egt is a class to parse Illumina EGT cluster files.
33+
*
34+
* This is a port of the EGT Python class in zCall to C++.
35+
* There is no public documentation for the EGT binary file format.
36+
* (However, the zCall EGT class is claimed to originate from Illumina.)
37+
*
38+
* Repository for zCall: https://github.com/wtsi-npg/zCall
39+
*
40+
* Native EGT format stores coordinates as (R, Theta) polar coordinates
41+
* Python EGT class converts to Cartesian for storage in memory
42+
* This class stores as polar
43+
* (More efficient than converting and storing as Cartesian, since both are
44+
needed for FCR output)
45+
*
46+
* IMPORTANT: Illumina defines its own polar coordinates:
47+
* - Theta is rescaled, such that 1 unit = pi/2 radians
48+
* - R is equal to x+y, *not* sqrt(x^2 + y^2)
49+
* - R is a "Manhattan distance", not Euclidean distance
50+
*/
51+
52+
#include <string>
53+
#include "Egt.h"
54+
55+
using namespace std;
56+
57+
Egt::Egt(bool verbose)
58+
{
59+
this->verbose = verbose;
60+
GENOTYPES_PER_SNP = 3;
61+
PARAMS_PER_SNP = 12;
62+
NUMERIC_BYTES = 4;
63+
ENTRIES_IN_RECORD = 30;
64+
BYTES_IN_RECORD = NUMERIC_BYTES * ENTRIES_IN_RECORD;
65+
ENTRIES_TO_USE = 15;
66+
}
67+
68+
void Egt::open(string filename)
69+
{
70+
this->filename = filename;
71+
ifstream file;
72+
73+
file.open(filename.c_str());
74+
if (!file) {
75+
cerr << "Can't open file: " << filename << endl << flush;
76+
exit(1);
77+
}
78+
// read header data
79+
readHeader(file);
80+
readPreface(file);
81+
if (verbose) {
82+
printHeader();
83+
printPreface();
84+
}
85+
// read cluster data
86+
// expected cluster positions, in polar coordinates (R, Theta) for (AA,AB,BB)
87+
// Order of params is: devRAA, devRAB, devRBB, meanRAA, meanRAB, meanRBB, devThetaAA, devThetaAB, devThetaBB, meanThetaAA, meanThetaAB, meanThetaBB
88+
// Can't use a multidimensional array because we don't know snpTotal at compile time; instead use a pseudo-multidimensional array such that the (i,j)th value is (i*WIDTH + j)
89+
counts = new int[GENOTYPES_PER_SNP*snpTotal]; // nAA, nAB, nBB
90+
params = new float[PARAMS_PER_SNP*snpTotal];
91+
char *block = new char[BYTES_IN_RECORD];
92+
for (int i=0; i<snpTotal; i++) {
93+
file.read(block, BYTES_IN_RECORD);
94+
int *ints = bytesToInts(block, 0, GENOTYPES_PER_SNP);
95+
float *floats = bytesToFloats(block, GENOTYPES_PER_SNP, ENTRIES_IN_RECORD);
96+
for (int j=0;j<GENOTYPES_PER_SNP;j++)
97+
counts[i*GENOTYPES_PER_SNP + j] = ints[j];
98+
for (int j=0;j<PARAMS_PER_SNP;j++)
99+
params[i*PARAMS_PER_SNP + j] = floats[j];
100+
delete floats;
101+
delete ints;
102+
}
103+
delete block;
104+
snpNames = new string[snpTotal];
105+
readSNPNames(file, snpNames);
106+
file.close();
107+
}
108+
109+
void Egt::open(char *filename)
110+
{
111+
string f = filename;
112+
open(f);
113+
}
114+
115+
116+
int* Egt::bytesToInts(char block[], int start, int end) {
117+
// convert a section of a byte array into ints
118+
// start, end indices refer to positions in the array of ints (not bytes)
119+
int *results = new int[end - start];
120+
numericConverter converter;
121+
for (int i=start;i<end;i++) {
122+
for (int j=0;j<NUMERIC_BYTES;j++) {
123+
converter.ncChar[j] = block[i*NUMERIC_BYTES + j];
124+
}
125+
results[i-start] = converter.ncInt;
126+
}
127+
return results;
128+
}
129+
130+
float* Egt::bytesToFloats(char block[], int start, int end) {
131+
// convert a section of a byte array into floats
132+
// start, end indices refer to positions in the array of floats (not bytes)
133+
float *results = new float[end - start];
134+
numericConverter converter;
135+
for (int i=start;i<end;i++) {
136+
for (int j=0;j<NUMERIC_BYTES;j++) {
137+
converter.ncChar[j] = block[i*NUMERIC_BYTES + j];
138+
}
139+
results[i-start] = converter.ncFloat;
140+
}
141+
return results;
142+
}
143+
144+
void Egt::getClusters(long index, float snp_params[]) {
145+
// return all (theta, R) params for given position in the SNP manifest
146+
// includes s.d. and mean of (r, theta) for AA, AB, BB
147+
int start = index*PARAMS_PER_SNP;
148+
for (int j=0; j < PARAMS_PER_SNP; j++) {
149+
snp_params[j] = this->params[start + j];
150+
}
151+
}
152+
153+
void Egt::getMeanR(long index, float means[]) {
154+
// find mean polar radius for AA, AB, BB at given index
155+
int start = index*PARAMS_PER_SNP + GENOTYPES_PER_SNP;
156+
for (int j=0; j < GENOTYPES_PER_SNP; j++) {
157+
means[j] = this->params[start + j];
158+
}
159+
}
160+
161+
void Egt::getMeanTheta(long index, float means[]) {
162+
// find mean polar angle for AA, AB, BB at given index
163+
int start = index*PARAMS_PER_SNP + 3*GENOTYPES_PER_SNP;
164+
for (int j=0; j < GENOTYPES_PER_SNP; j++) {
165+
means[j] = this->params[start + j];
166+
}
167+
}
168+
169+
numericConverter Egt::getNextConverter(ifstream &file) {
170+
// convenience method to read the next few bytes into a union
171+
// can then use the union for numeric conversion
172+
char * buffer;
173+
buffer = new char[NUMERIC_BYTES];
174+
file.read(buffer, NUMERIC_BYTES);
175+
numericConverter converter;
176+
for (int i=0; i<NUMERIC_BYTES; i++) {
177+
converter.ncChar[i] = buffer[i];
178+
}
179+
delete buffer;
180+
return converter;
181+
}
182+
183+
float Egt::readFloat(ifstream &file) {
184+
float result;
185+
numericConverter converter = getNextConverter(file);
186+
result = converter.ncFloat;
187+
return result;
188+
189+
}
190+
191+
void Egt::readHeader(ifstream &file)
192+
{
193+
// populate instance variables with header values
194+
file.seekg(0); // set read position to zero, if not already there
195+
fileVersion = readInteger(file);
196+
gcVersion = readString(file);
197+
clusterVersion = readString(file);
198+
callVersion = readString(file);
199+
normalizationVersion = readString(file);
200+
dateCreated = readString(file);
201+
mode = file.get();
202+
manifest = readString(file);
203+
}
204+
205+
int Egt::readInteger(ifstream &file)
206+
{
207+
int result;
208+
numericConverter converter = getNextConverter(file);
209+
result = converter.ncInt;
210+
return result;
211+
}
212+
213+
void Egt::readPreface(ifstream &file) {
214+
// read the 'preface' from the body of an EGT file
215+
// assumes file is positioned at start of the body
216+
dataVersion = readInteger(file);
217+
opa = readString(file);
218+
snpTotal = readInteger(file);
219+
}
220+
221+
void Egt::readSNPNames(ifstream &file, string names[]) {
222+
// read SNP names from an EGT file
223+
// assumes file is positioned at end of cluster (mean, sd) data
224+
int pos = file.tellg();
225+
file.seekg(pos + 13 * snpTotal); // skip SNP quality scores
226+
for (int i=0;i<snpTotal;i++) {
227+
// skip genotype scores
228+
// length of strings is unknown, so must read each one and discard it
229+
readString(file);
230+
}
231+
for (int i=0;i<snpTotal;i++) {
232+
names[i] = readString(file);
233+
}
234+
}
235+
236+
string Egt::readString(ifstream &file) {
237+
// EGT string format is as follows:
238+
// - First byte is a *signed* char encoding the string length
239+
// - Subsequent bytes contain the string
240+
// Total bytes read is (length encoded in first byte)+1 -- at most 128
241+
int length = int(file.get()); // get a single byte
242+
if (not file.good())
243+
throw("Cannot read length from EGT file, file state is not good");
244+
else if (length <= 0)
245+
throw("Illegal string length in EGT file");
246+
char * buffer;
247+
buffer = new char[length+1];
248+
buffer[length] = '\0'; // ensure buffer ends with a null character
249+
file.read(buffer, length);
250+
if (not file.good())
251+
throw("Cannot read string from EGT file, file state is not good");
252+
string result = string(buffer);
253+
delete buffer;
254+
return result;
255+
}
256+
257+
void Egt::printHeader() {
258+
// convenience method to print file header
259+
cout << "FILE_VERSION " << fileVersion << endl;
260+
cout << "GC_VERSION " << gcVersion << endl;
261+
cout << "CLUSTER_VERSION " << clusterVersion << endl;
262+
cout << "CALL_VERSION " << callVersion << endl;
263+
cout << "NORMALIZATION_VERSION " << normalizationVersion << endl;
264+
cout << "DATE_CREATED " << dateCreated << endl;
265+
cout << "MODE " << (int) mode << endl;
266+
cout << "MANIFEST " << manifest << endl;
267+
}
268+
269+
void Egt::printPreface() {
270+
// convenience method to print preface of "main" data section
271+
cout << "DATA_VERSION " << dataVersion << endl;
272+
cout << "OPA " << opa << endl;
273+
cout << "TOTAL_SNPS " << snpTotal << endl;
274+
}

Egt.h

+99
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
//
2+
// Egt.h
3+
//
4+
// Author: Iain Bancarz <[email protected]>
5+
//
6+
// Copyright (c) 2014 Genome Research Ltd.
7+
//
8+
// Redistribution and use in source and binary forms, with or without
9+
// modification, are permitted provided that the following conditions are met:
10+
// 1. Redistributions of source code must retain the above copyright notice,
11+
// this list of conditions and the following disclaimer.
12+
// 2. Redistributions in binary form must reproduce the above copyright
13+
// notice, this list of conditions and the following disclaimer in the
14+
// documentation and/or other materials provided with the distribution.
15+
// 3. Neither the name of Genome Research Ltd nor the names of the
16+
// contributors may be used to endorse or promote products derived from
17+
// software without specific prior written permission.
18+
//
19+
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20+
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21+
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22+
// IN NO EVENT SHALL GENOME RESEARCH LTD. BE LIABLE FOR ANY DIRECT, INDIRECT,
23+
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24+
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
25+
// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26+
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27+
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28+
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29+
//
30+
31+
#ifndef _EGT_H
32+
#define _EGT_H
33+
34+
#include <string>
35+
#include <iostream>
36+
#include <fstream>
37+
38+
using namespace std;
39+
40+
union numericConverter {
41+
float ncFloat;
42+
int ncInt;
43+
char ncChar[sizeof(int)];
44+
};
45+
46+
class Egt {
47+
48+
public:
49+
Egt(bool verbose=false);
50+
//~Egt();
51+
void getClusters(long index, float params[]);
52+
void getMeanR(long index, float means[]);
53+
void getMeanTheta(long index, float means[]);
54+
void open(char *filename);
55+
void open(string filename);
56+
void printHeader();
57+
void printPreface();
58+
string filename;
59+
bool verbose;
60+
// constants
61+
int NUMERIC_BYTES;
62+
int ENTRIES_IN_RECORD;
63+
int BYTES_IN_RECORD;
64+
int GENOTYPES_PER_SNP;
65+
int ENTRIES_TO_USE;
66+
int PARAMS_PER_SNP;
67+
// EGT header fields
68+
long fileVersion;
69+
string gcVersion;
70+
string clusterVersion;
71+
string callVersion;
72+
string normalizationVersion;
73+
string dateCreated;
74+
char mode;
75+
string manifest;
76+
// EGT 'file preface' fields
77+
long dataVersion;
78+
string opa;
79+
long snpTotal;
80+
// arrays for numerical data
81+
int *counts;
82+
float *params;
83+
// array for SNP names
84+
string *snpNames;
85+
86+
private:
87+
int* bytesToInts(char block[], int start, int end);
88+
float* bytesToFloats(char block[], int start, int end);
89+
numericConverter getNextConverter(ifstream &file);
90+
void readHeader(ifstream &file);
91+
int readInteger(ifstream &file);
92+
float readFloat(ifstream &file);
93+
void readPreface(ifstream &file);
94+
void readSNPNames(ifstream &file, string names[]);
95+
string readString(ifstream &file);
96+
97+
};
98+
99+
#endif // _EGT_H

0 commit comments

Comments
 (0)