-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathgrouping.hpp
150 lines (133 loc) · 5.46 KB
/
grouping.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#ifndef grouping_hpp_INCLUDED
#define grouping_hpp_INCLUDED
#include "CmdLineArgs.hpp"
#include "common.hpp"
#include "iohts.hpp"
#include "MolecularID.hpp"
#include "htslib/sam.h"
#include <cassert>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <tuple>
#include <vector>
#include <limits.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#define logDEBUGx1 logDEBUG // set to logINFO to enable it
struct SamIter {
const std::string input_bam_fname;
const std::string & tier1_target_region;
const std::string region_bed_fname;
const int64_t bed_in_avg_sequencing_DP;
const uvc1_flag_t bed_in_avg_sequencing_DP_n_from_t;
const size_t nthreads;
const int64_t mem_per_thread;
const bool is_fastq_gen;
samFile *sam_infile = NULL;
bam_hdr_t *samheader = NULL;
hts_idx_t *sam_idx = NULL;
hts_itr_t *sam_itr = NULL;
bam1_t *alnrecord = bam_init1();
uvc1_refgpos_t last_it_tid = -1;
uvc1_refgpos_t last_it_beg = -1;
uvc1_refgpos_t last_it_end = -1;
std::vector<BedLine> _bedlines;
size_t _bedregion_idx = 0;
SamIter(const CommandLineArgs ¶mset):
input_bam_fname(paramset.bam_input_fname),
tier1_target_region(paramset.tier1_target_region),
region_bed_fname(paramset.bed_region_fname),
bed_in_avg_sequencing_DP(paramset.bed_in_avg_sequencing_DP),
bed_in_avg_sequencing_DP_n_from_t(paramset.bed_in_avg_sequencing_DP_n_from_t),
nthreads(paramset.max_cpu_num),
mem_per_thread(paramset.mem_per_thread),
is_fastq_gen(paramset.fam_consensus_out_fastq.size() > 0) {
this->sam_infile = sam_open(input_bam_fname.c_str(), "r");
if (NULL == this->sam_infile) {
fprintf(stderr, "Failed to open the file %s!", input_bam_fname.c_str());
abort();
}
this->samheader = sam_hdr_read(sam_infile);
if (NULL == this->samheader) {
fprintf(stderr, "Failed to read the header of the file %s!", input_bam_fname.c_str());
abort();
}
if (IS_PROVIDED(this->tier1_target_region)) {
this->sam_idx = sam_index_load(this->sam_infile, input_bam_fname.c_str());
if (NULL == this->sam_idx) {
fprintf(stderr, "Failed to load the index for the file %s!", input_bam_fname.c_str());
abort();
}
this->sam_itr = sam_itr_querys(this->sam_idx, this->samheader, this->tier1_target_region.c_str());
if (NULL == this->sam_itr) {
fprintf(stderr, "Failed to load the region %s in the indexed file %s!", tier1_target_region.c_str(), input_bam_fname.c_str());
abort();
}
target_region_to_contigs(this->_bedlines, this->tier1_target_region, this->samheader);
} else if (IS_PROVIDED(this->region_bed_fname)) {
this->sam_idx = sam_index_load(this->sam_infile, input_bam_fname.c_str());
if (NULL == this->sam_idx) {
fprintf(stderr, "Failed to load the index for the file %s!", input_bam_fname.c_str());
abort();
}
bed_fname_to_contigs(this->_bedlines, this->region_bed_fname, this->samheader);
}
}
~SamIter() {
bam_destroy1(alnrecord);
if (NULL != sam_itr) { sam_itr_destroy(sam_itr); }
if (NULL != sam_idx) { hts_idx_destroy(sam_idx); }
bam_hdr_destroy(samheader);
sam_close(sam_infile);
}
int
bed_fname_to_contigs(
std::vector<BedLine> & bedlines,
const std::string & bed_fname,
const bam_hdr_t *bam_hdr);
int
target_region_to_contigs(
std::vector<BedLine> & bedlines,
const std::string & tier1_target_region,
const bam_hdr_t *bam_hdr);
int64_t
iternext(
uvc1_flag_t & iter_ret_flag,
std::vector<BedLine> & bedlines,
const uvc1_flag_t specialflag IGNORE_UNUSED_PARAM);
};
int
samfname_to_tid_to_tname_tseq_tup_vec(
std::vector<std::tuple<std::string, uvc1_refgpos_t>> & tid_to_tname_tseqlen_tuple_vec,
const std::string & bam_input_fname);
int
clean_fill_strand_umi_readset(
std::vector<std::array<std::vector<std::vector<bam1_t *>>, 2>> &umi_strand_readset);
int
fill_strand_umi_readset_with_strand_to_umi_to_reads(
std::vector<std::pair<std::array<std::vector<std::vector<bam1_t *>>, 2>, MolecularBarcode>> &umi_strand_readset,
std::map<MolecularBarcode, std::pair<std::array<std::map<uvc1_hash_t, std::vector<bam1_t *>>, 2>, MolecularBarcode>> &umi_to_strand_to_reads,
const CommandLineArgs & paramset,
uvc1_flag_t specialflag);
std::array<uvc1_readnum_big_t, 3>
bamfname_to_strand_to_familyuid_to_reads(
std::map<MolecularBarcode, std::pair<std::array<std::map<uvc1_hash_t, std::vector<bam1_t *>>, 2>, MolecularBarcode>> &umi_to_strand_to_reads,
uvc1_refgpos_t & extended_inclu_beg_pos,
uvc1_refgpos_t & extended_exclu_end_pos,
uvc1_refgpos_t tid,
uvc1_refgpos_t fetch_tbeg,
uvc1_refgpos_t fetch_tend,
bool end2end,
size_t regionbatch_ordinal,
size_t regionbatch_tot_num,
const std::string UMI_STRUCT_STRING,
samFile *sam_file,
const hts_idx_t * hts_idx,
size_t thread_id,
const CommandLineArgs & paramset,
uvc1_flag_t specialflag);
#endif