|
| 1 | + |
| 2 | +// Egt.cpp |
| 3 | +// |
| 4 | +// Author: Iain Bancarz <[email protected]> |
| 5 | +// |
| 6 | +// Copyright (c) 2014 Genome Research Ltd. |
| 7 | +// |
| 8 | +// Redistribution and use in source and binary forms, with or without |
| 9 | +// modification, are permitted provided that the following conditions are met: |
| 10 | +// 1. Redistributions of source code must retain the above copyright notice, |
| 11 | +// this list of conditions and the following disclaimer. |
| 12 | +// 2. Redistributions in binary form must reproduce the above copyright |
| 13 | +// notice, this list of conditions and the following disclaimer in the |
| 14 | +// documentation and/or other materials provided with the distribution. |
| 15 | +// 3. Neither the name of Genome Research Ltd nor the names of the |
| 16 | +// contributors may be used to endorse or promote products derived from |
| 17 | +// software without specific prior written permission. |
| 18 | +// |
| 19 | +// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR |
| 20 | +// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
| 21 | +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. |
| 22 | +// IN NO EVENT SHALL GENOME RESEARCH LTD. BE LIABLE FOR ANY DIRECT, INDIRECT, |
| 23 | +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
| 24 | +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF |
| 25 | +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| 26 | +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 27 | +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF |
| 28 | +// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 29 | +// |
| 30 | + |
| 31 | +/* |
| 32 | + * Egt is a class to parse Illumina EGT cluster files. |
| 33 | + * |
| 34 | + * This is a port of the EGT Python class in zCall to C++. |
| 35 | + * There is no public documentation for the EGT binary file format. |
| 36 | + * (However, the zCall EGT class is claimed to originate from Illumina.) |
| 37 | + * |
| 38 | + * Repository for zCall: https://github.com/wtsi-npg/zCall |
| 39 | + * |
| 40 | + * Native EGT format stores coordinates as (R, Theta) polar coordinates |
| 41 | + * Python EGT class converts to Cartesian for storage in memory |
| 42 | + * This class stores as polar |
| 43 | + * (More efficient than converting and storing as Cartesian, since both are |
| 44 | +needed for FCR output) |
| 45 | + * |
| 46 | + * IMPORTANT: Illumina defines its own polar coordinates: |
| 47 | + * - Theta is rescaled, such that 1 unit = pi/2 radians |
| 48 | + * - R is equal to x+y, *not* sqrt(x^2 + y^2) |
| 49 | + * - R is a "Manhattan distance", not Euclidean distance |
| 50 | + */ |
| 51 | + |
| 52 | +#include <string> |
| 53 | +#include "Egt.h" |
| 54 | + |
| 55 | +using namespace std; |
| 56 | + |
| 57 | +Egt::Egt(bool verbose) |
| 58 | +{ |
| 59 | + this->verbose = verbose; |
| 60 | + GENOTYPES_PER_SNP = 3; |
| 61 | + PARAMS_PER_SNP = 12; |
| 62 | + NUMERIC_BYTES = 4; |
| 63 | + ENTRIES_IN_RECORD = 30; |
| 64 | + BYTES_IN_RECORD = NUMERIC_BYTES * ENTRIES_IN_RECORD; |
| 65 | + ENTRIES_TO_USE = 15; |
| 66 | +} |
| 67 | + |
| 68 | +void Egt::open(string filename) |
| 69 | +{ |
| 70 | + this->filename = filename; |
| 71 | + ifstream file; |
| 72 | + |
| 73 | + file.open(filename.c_str()); |
| 74 | + if (!file) { |
| 75 | + cerr << "Can't open file: " << filename << endl << flush; |
| 76 | + exit(1); |
| 77 | + } |
| 78 | + // read header data |
| 79 | + readHeader(file); |
| 80 | + readPreface(file); |
| 81 | + if (verbose) { |
| 82 | + printHeader(); |
| 83 | + printPreface(); |
| 84 | + } |
| 85 | + // read cluster data |
| 86 | + // expected cluster positions, in polar coordinates (R, Theta) for (AA,AB,BB) |
| 87 | + // Order of params is: devRAA, devRAB, devRBB, meanRAA, meanRAB, meanRBB, devThetaAA, devThetaAB, devThetaBB, meanThetaAA, meanThetaAB, meanThetaBB |
| 88 | + // Can't use a multidimensional array because we don't know snpTotal at compile time; instead use a pseudo-multidimensional array such that the (i,j)th value is (i*WIDTH + j) |
| 89 | + counts = new int[GENOTYPES_PER_SNP*snpTotal]; // nAA, nAB, nBB |
| 90 | + params = new float[PARAMS_PER_SNP*snpTotal]; |
| 91 | + char *block = new char[BYTES_IN_RECORD]; |
| 92 | + for (int i=0; i<snpTotal; i++) { |
| 93 | + file.read(block, BYTES_IN_RECORD); |
| 94 | + int *ints = bytesToInts(block, 0, GENOTYPES_PER_SNP); |
| 95 | + float *floats = bytesToFloats(block, GENOTYPES_PER_SNP, ENTRIES_IN_RECORD); |
| 96 | + for (int j=0;j<GENOTYPES_PER_SNP;j++) |
| 97 | + counts[i*GENOTYPES_PER_SNP + j] = ints[j]; |
| 98 | + for (int j=0;j<PARAMS_PER_SNP;j++) |
| 99 | + params[i*PARAMS_PER_SNP + j] = floats[j]; |
| 100 | + delete floats; |
| 101 | + delete ints; |
| 102 | + } |
| 103 | + delete block; |
| 104 | + snpNames = new string[snpTotal]; |
| 105 | + readSNPNames(file, snpNames); |
| 106 | + file.close(); |
| 107 | +} |
| 108 | + |
| 109 | +void Egt::open(char *filename) |
| 110 | +{ |
| 111 | + string f = filename; |
| 112 | + open(f); |
| 113 | +} |
| 114 | + |
| 115 | + |
| 116 | +int* Egt::bytesToInts(char block[], int start, int end) { |
| 117 | + // convert a section of a byte array into ints |
| 118 | + // start, end indices refer to positions in the array of ints (not bytes) |
| 119 | + int *results = new int[end - start]; |
| 120 | + numericConverter converter; |
| 121 | + for (int i=start;i<end;i++) { |
| 122 | + for (int j=0;j<NUMERIC_BYTES;j++) { |
| 123 | + converter.ncChar[j] = block[i*NUMERIC_BYTES + j]; |
| 124 | + } |
| 125 | + results[i-start] = converter.ncInt; |
| 126 | + } |
| 127 | + return results; |
| 128 | +} |
| 129 | + |
| 130 | +float* Egt::bytesToFloats(char block[], int start, int end) { |
| 131 | + // convert a section of a byte array into floats |
| 132 | + // start, end indices refer to positions in the array of floats (not bytes) |
| 133 | + float *results = new float[end - start]; |
| 134 | + numericConverter converter; |
| 135 | + for (int i=start;i<end;i++) { |
| 136 | + for (int j=0;j<NUMERIC_BYTES;j++) { |
| 137 | + converter.ncChar[j] = block[i*NUMERIC_BYTES + j]; |
| 138 | + } |
| 139 | + results[i-start] = converter.ncFloat; |
| 140 | + } |
| 141 | + return results; |
| 142 | +} |
| 143 | + |
| 144 | +void Egt::getClusters(long index, float snp_params[]) { |
| 145 | + // return all (theta, R) params for given position in the SNP manifest |
| 146 | + // includes s.d. and mean of (r, theta) for AA, AB, BB |
| 147 | + int start = index*PARAMS_PER_SNP; |
| 148 | + for (int j=0; j < PARAMS_PER_SNP; j++) { |
| 149 | + snp_params[j] = this->params[start + j]; |
| 150 | + } |
| 151 | +} |
| 152 | + |
| 153 | +void Egt::getMeanR(long index, float means[]) { |
| 154 | + // find mean polar radius for AA, AB, BB at given index |
| 155 | + int start = index*PARAMS_PER_SNP + GENOTYPES_PER_SNP; |
| 156 | + for (int j=0; j < GENOTYPES_PER_SNP; j++) { |
| 157 | + means[j] = this->params[start + j]; |
| 158 | + } |
| 159 | +} |
| 160 | + |
| 161 | +void Egt::getMeanTheta(long index, float means[]) { |
| 162 | + // find mean polar angle for AA, AB, BB at given index |
| 163 | + int start = index*PARAMS_PER_SNP + 3*GENOTYPES_PER_SNP; |
| 164 | + for (int j=0; j < GENOTYPES_PER_SNP; j++) { |
| 165 | + means[j] = this->params[start + j]; |
| 166 | + } |
| 167 | +} |
| 168 | + |
| 169 | +numericConverter Egt::getNextConverter(ifstream &file) { |
| 170 | + // convenience method to read the next few bytes into a union |
| 171 | + // can then use the union for numeric conversion |
| 172 | + char * buffer; |
| 173 | + buffer = new char[NUMERIC_BYTES]; |
| 174 | + file.read(buffer, NUMERIC_BYTES); |
| 175 | + numericConverter converter; |
| 176 | + for (int i=0; i<NUMERIC_BYTES; i++) { |
| 177 | + converter.ncChar[i] = buffer[i]; |
| 178 | + } |
| 179 | + delete buffer; |
| 180 | + return converter; |
| 181 | +} |
| 182 | + |
| 183 | +float Egt::readFloat(ifstream &file) { |
| 184 | + float result; |
| 185 | + numericConverter converter = getNextConverter(file); |
| 186 | + result = converter.ncFloat; |
| 187 | + return result; |
| 188 | + |
| 189 | +} |
| 190 | + |
| 191 | +void Egt::readHeader(ifstream &file) |
| 192 | +{ |
| 193 | + // populate instance variables with header values |
| 194 | + file.seekg(0); // set read position to zero, if not already there |
| 195 | + fileVersion = readInteger(file); |
| 196 | + gcVersion = readString(file); |
| 197 | + clusterVersion = readString(file); |
| 198 | + callVersion = readString(file); |
| 199 | + normalizationVersion = readString(file); |
| 200 | + dateCreated = readString(file); |
| 201 | + mode = file.get(); |
| 202 | + manifest = readString(file); |
| 203 | +} |
| 204 | + |
| 205 | +int Egt::readInteger(ifstream &file) |
| 206 | +{ |
| 207 | + int result; |
| 208 | + numericConverter converter = getNextConverter(file); |
| 209 | + result = converter.ncInt; |
| 210 | + return result; |
| 211 | +} |
| 212 | + |
| 213 | +void Egt::readPreface(ifstream &file) { |
| 214 | + // read the 'preface' from the body of an EGT file |
| 215 | + // assumes file is positioned at start of the body |
| 216 | + dataVersion = readInteger(file); |
| 217 | + opa = readString(file); |
| 218 | + snpTotal = readInteger(file); |
| 219 | +} |
| 220 | + |
| 221 | +void Egt::readSNPNames(ifstream &file, string names[]) { |
| 222 | + // read SNP names from an EGT file |
| 223 | + // assumes file is positioned at end of cluster (mean, sd) data |
| 224 | + int pos = file.tellg(); |
| 225 | + file.seekg(pos + 13 * snpTotal); // skip SNP quality scores |
| 226 | + for (int i=0;i<snpTotal;i++) { |
| 227 | + // skip genotype scores |
| 228 | + // length of strings is unknown, so must read each one and discard it |
| 229 | + readString(file); |
| 230 | + } |
| 231 | + for (int i=0;i<snpTotal;i++) { |
| 232 | + names[i] = readString(file); |
| 233 | + } |
| 234 | +} |
| 235 | + |
| 236 | +string Egt::readString(ifstream &file) { |
| 237 | + // EGT string format is as follows: |
| 238 | + // - First byte is a *signed* char encoding the string length |
| 239 | + // - Subsequent bytes contain the string |
| 240 | + // Total bytes read is (length encoded in first byte)+1 -- at most 128 |
| 241 | + int length = int(file.get()); // get a single byte |
| 242 | + if (not file.good()) |
| 243 | + throw("Cannot read length from EGT file, file state is not good"); |
| 244 | + else if (length <= 0) |
| 245 | + throw("Illegal string length in EGT file"); |
| 246 | + char * buffer; |
| 247 | + buffer = new char[length+1]; |
| 248 | + buffer[length] = '\0'; // ensure buffer ends with a null character |
| 249 | + file.read(buffer, length); |
| 250 | + if (not file.good()) |
| 251 | + throw("Cannot read string from EGT file, file state is not good"); |
| 252 | + string result = string(buffer); |
| 253 | + delete buffer; |
| 254 | + return result; |
| 255 | +} |
| 256 | + |
| 257 | +void Egt::printHeader() { |
| 258 | + // convenience method to print file header |
| 259 | + cout << "FILE_VERSION " << fileVersion << endl; |
| 260 | + cout << "GC_VERSION " << gcVersion << endl; |
| 261 | + cout << "CLUSTER_VERSION " << clusterVersion << endl; |
| 262 | + cout << "CALL_VERSION " << callVersion << endl; |
| 263 | + cout << "NORMALIZATION_VERSION " << normalizationVersion << endl; |
| 264 | + cout << "DATE_CREATED " << dateCreated << endl; |
| 265 | + cout << "MODE " << (int) mode << endl; |
| 266 | + cout << "MANIFEST " << manifest << endl; |
| 267 | +} |
| 268 | + |
| 269 | +void Egt::printPreface() { |
| 270 | + // convenience method to print preface of "main" data section |
| 271 | + cout << "DATA_VERSION " << dataVersion << endl; |
| 272 | + cout << "OPA " << opa << endl; |
| 273 | + cout << "TOTAL_SNPS " << snpTotal << endl; |
| 274 | +} |
0 commit comments