|
17 | 17 | */ |
18 | 18 |
|
19 | 19 | #include "smithlab_os.hpp" |
20 | | -#include "QualityScore.hpp" |
21 | 20 | #include "smithlab_utils.hpp" |
22 | 21 |
|
| 22 | +#include <dirent.h> |
| 23 | +#include <sys/stat.h> |
| 24 | +#include <unistd.h> |
| 25 | + |
23 | 26 | #include <algorithm> |
24 | 27 | #include <cassert> |
25 | 28 | #include <cerrno> |
| 29 | +#include <cmath> |
26 | 30 | #include <cstring> |
27 | 31 | #include <filesystem> |
28 | 32 | #include <fstream> |
29 | 33 | #include <iterator> |
30 | 34 | #include <stdexcept> |
31 | 35 | #include <unordered_map> |
32 | 36 |
|
33 | | -#include <dirent.h> |
34 | | -#include <sys/stat.h> |
35 | | -#include <unistd.h> |
36 | | - |
37 | 37 | using std::begin; |
38 | 38 | using std::ios_base; |
39 | 39 | using std::runtime_error; |
@@ -198,11 +198,41 @@ inline bool is_fastq_score_line(size_t line_count) { |
198 | 198 | return ((line_count & 3ul) == 3ul); |
199 | 199 | } |
200 | 200 |
|
| 201 | +static const double neg_ten_over_log_ten = -4.342944819032517501; |
| 202 | + |
| 203 | +//// CONVERT _TO_ ERROR PROBABILITIES |
| 204 | +inline double phred_to_error_probability(const double r) { |
| 205 | + const double h = r / neg_ten_over_log_ten; |
| 206 | + return std::exp(h); |
| 207 | +} |
| 208 | +inline double solexa_to_error_probability(const double r) { |
| 209 | + const double s = r / neg_ten_over_log_ten; |
| 210 | + return std::exp(s) / (1.0 + std::exp(s)); |
| 211 | +} |
| 212 | + |
| 213 | +//// CONVERT _FROM_ QUALITY CHARACTERS (I.E. THE CHARACTERS IN FASTQ FILES) |
| 214 | +inline char quality_character_to_phred(const char c) { return char(c - 33); } |
| 215 | +inline char quality_character_to_solexa(const char c) { return char(c - 64); } |
| 216 | + |
| 217 | +//// CONVERT _TO_ QUALITY CHARACTERS (I.E. THE CHARACTERS IN FASTQ FILES) |
| 218 | +inline char phred_to_quality_character(const double h) { |
| 219 | + return char(std::min(60.0, h) + 33); |
| 220 | +} |
| 221 | +inline char solexa_to_quality_character(const double s) { |
| 222 | + return char(std::min(40.0, s) + 64); |
| 223 | +} |
| 224 | + |
| 225 | +//// CHECK FOR VALIDITY OF A FASTQ SCORE CHARACTER |
| 226 | +inline bool valid_phred_score(const char c) { return (c >= 33 && c <= 93); } |
| 227 | +inline bool valid_solexa_score(const char c) { |
| 228 | + // return (c >= 64 && c <= 104); |
| 229 | + return (c >= 59 && c <= 104); // to allow for old Solexa format |
| 230 | +} |
| 231 | + |
201 | 232 | void read_fastq_file(const char *filename, vector<string> &names, |
202 | 233 | vector<string> &sequences, |
203 | 234 | vector<vector<double>> &scores) { |
204 | | - |
205 | | - static const size_t INPUT_BUFFER_SIZE = 1000000; |
| 235 | + static constexpr auto INPUT_BUFFER_SIZE = 1000000ul; |
206 | 236 |
|
207 | 237 | std::ifstream in(filename); |
208 | 238 | if (!in) |
@@ -274,13 +304,15 @@ void read_fastq_file(const char *filename, vector<string> &names, |
274 | 304 | bool phred_scores = true, solexa_scores = true; |
275 | 305 | for (size_t i = 0; i < scrs.size() && phred_scores && solexa_scores; ++i) { |
276 | 306 | phred_scores = |
277 | | - phred_scores && (find_if(begin(scrs[i]), end(scrs[i]), [](char c) { |
278 | | - return !valid_phred_score(c); |
279 | | - }) == end(scrs[i])); |
| 307 | + phred_scores && (std::find_if(std::cbegin(scrs[i]), std::cend(scrs[i]), |
| 308 | + [](const char c) { |
| 309 | + return !valid_phred_score(c); |
| 310 | + }) == std::cend(scrs[i])); |
280 | 311 | solexa_scores = |
281 | | - solexa_scores && (find_if(begin(scrs[i]), end(scrs[i]), [](char c) { |
282 | | - return !valid_solexa_score(c); |
283 | | - }) == end(scrs[i])); |
| 312 | + solexa_scores && (std::find_if(std::cbegin(scrs[i]), std::cend(scrs[i]), |
| 313 | + [](const char c) { |
| 314 | + return !valid_solexa_score(c); |
| 315 | + }) == std::cend(scrs[i])); |
284 | 316 | } |
285 | 317 |
|
286 | 318 | if (!phred_scores && !solexa_scores) |
|
0 commit comments