-
Notifications
You must be signed in to change notification settings - Fork 25
/
Copy pathproject.yml
226 lines (201 loc) · 9.63 KB
/
project.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
#########################################
# THIS IS A TYPICAL project.yml TEMPLATE
# most of the settings present here
# are GO for mapping production data
# but nonetheless user must consider
# carefully every presented option
#########################################
#########################################
# When commenting parameters out, make sure
# that each section still has at least one
# uncommented parameter, otherwise it
# will not get properly parsed.
#########################################
#######################################
# provide paths to your raw input data (fastq-s):
#######################################
# Fastqs can be provided as:
# -- a pair of relative/absolute paths
# -- sra:<SRA_NUMBER>, optionally followed by the indices of the first and
# the last entry in the SRA in the form of "?start=<first>&end=<last>"
input:
  raw_reads_paths:
    # Substitute the locations of your fastq files below.
    # Each library maps lane names to a pair of fastq files (side 1, side 2).
    # An example of a 1-lane library:
    your_library_name_1:
      lane1:
        - /path/to/your/fastq/from/library1/library1_side1.fastq.gz
        - /path/to/your/fastq/from/library1/library1_side2.fastq.gz
    # An example of a 2-lane library:
    # mapped reads from both lanes will be pooled together
    # and deduplicated as one.
    your_library_name_2:
      lane1:
        - /path/to/your/fastq/from/library1/library2_lane1_side1.fastq.gz
        - /path/to/your/fastq/from/library1/library2_lane1_side2.fastq.gz
      lane2:
        - /path/to/your/fastq/from/library1/library2_lane2_side1.fastq.gz
        - /path/to/your/fastq/from/library1/library2_lane2_side2.fastq.gz
    # An example of a 3-lane library:
    # mapped reads from all three lanes will be pooled together
    # and deduplicated as one.
    your_library_name_3:
      lane1:
        - /path/to/your/fastq/from/library1/library3_lane1_side1.fastq.gz
        - /path/to/your/fastq/from/library1/library3_lane1_side2.fastq.gz
      lane2:
        - /path/to/your/fastq/from/library1/library3_lane2_side1.fastq.gz
        - /path/to/your/fastq/from/library1/library3_lane2_side2.fastq.gz
      lane3:
        - /path/to/your/fastq/from/library1/library3_lane3_side1.fastq.gz
        - /path/to/your/fastq/from/library1/library3_lane3_side2.fastq.gz
    your_library_online:
      lane1:
        # Files can be provided as http/ftp/s3 links.
        - https://github.com/open2c/distiller-test-data/raw/master/fastq/MATalpha_R1/lane2/SRR2601843_1.fastq.gz
        - https://github.com/open2c/distiller-test-data/raw/master/fastq/MATalpha_R1/lane2/SRR2601843_2.fastq.gz
      lane2:
        # Alternatively, reads can be downloaded from SRA by accession.
        - sra:SRR2601842
      lane3:
        # An SRA accession can be restricted to a range of entries
        # with "?start=<first>&end=<last>".
        - sra:SRR2601843?start=0&end=10000

  # Independent libraries can be combined together
  # on the level of binned data (.cool files).
  # Describe your groupings of choice here:
  library_groups:
    libraries_1_and_2:
      - your_library_name_1
      - your_library_name_2
    all_libraries:
      - your_library_name_1
      - your_library_name_2
      - your_library_name_3
      - your_library_online

  # Truncate input fastqs to a small number of reads (e.g. 10000) for
  # semi-dry test runs.
  # NOTE: when the inputs are specified as an SRA number, only this number of
  # reads is downloaded!
  # NOTE(review): 0 presumably disables truncation - confirm against the
  # pipeline code.
  truncate_fastq_reads: 0

  # Specify a reference genome to align sequenced reads.
  # Provide the genome assembly name, a wildcard path to the BWA index files
  # of the reference genome, and a tab-separated table with contig sizes
  # (known as "chrom.sizes"). The latter is used to specify the subset and the
  # order of contigs in a resulting contact map.
  genome:
    assembly_name: 'hg19'
    bwa_index_wildcard_path: '/path/to/your/reference/genome/hg19.fa.*'
    chrom_sizes_path: '/path/to/your/reference/genome/hg19.reduced.chrom.sizes'
# Set to true to run FastQC on the input files.
do_fastqc: false
# Control how reads are mapped to the reference genome.
map:
  # If 'chunksize' is non-zero, each input file gets split into multiple
  # chunks, each mapped separately. Useful for mapping on clusters with many
  # relatively weak nodes.
  # The optimal chunk size is defined by the balance between mapping and
  # merging. Smaller chunks (~30M) are better for clusters with many weak
  # nodes; however, having >~10 chunks per run slows down merging.
  chunksize: 0

  # Specify extra BWA mapping options.
  mapping_options: ''

  # Specify fastp trim options,
  # e.g. '--detect_adapter_for_pe -q 15'.
  trim_options: ''

  # A more technical option: use a custom script to split fastq files from SRA
  # into two files, one per read side. By default it is true, which is
  # faster (because we can use multi-threaded compression), but less
  # stable. Set to false if you download files from SRA and bwa complains
  # about unpaired reads.
  use_custom_split: true
# Control how read alignments are converted ('parsed') into Hi-C pairs.
parse:
  # If 'make_pairsam' is true, parsed Hi-C pairs will store complete
  # alignment records in the SAM format (the resulting hybrid between the
  # .pairs and .sam formats is called '.pairsam'). Such files can be useful
  # for thorough investigation of Hi-C data. Downstream of parsing, pairsams
  # are split into .pairs and .bam, and .bam alignments are tagged with
  # Hi-C related information. 'make_pairsam' roughly doubles the storage
  # and I/O requirements and should be used only when absolutely needed.
  # NOTE: when 'make_pairsam' is false, the initial output of parsing is still
  # called '.pairsam' despite missing SAM alignments, for technical reasons.
  make_pairsam: false

  # When 'make_pairsam' is true, enabling 'drop_seq' erases sequences and
  # Phred scores from the SAM alignments in .pairsam and .bam output files.
  # Enable to make lightweight .pairsam/.bam output.
  # NOTE: when 'make_pairsam' is false, 'drop_seq' is ignored.
  drop_seq: false

  # Enable 'drop_readid' to drop read IDs from .pairs files to create
  # lightweight .pairs files.
  # NOTE: does not affect alignment records in the .pairsam files and,
  # subsequently, in the .bam files produced by .pairsam splitting.
  drop_readid: true

  # When 'keep_unparsed_bams' is true, distiller preserves the _immediate_
  # output of bwa in the .bam format. Could be used as a faster alternative
  # to 'make_pairsam' when alignments are needed, but tagging them with Hi-C
  # related information is not necessary.
  keep_unparsed_bams: false

  # Pass extra options to pairtools parse, on top of the ones specified by
  # flags 'make_pairsam', 'drop_readid', 'drop_seq'. The default value
  # enables storing MAPQ scores in the .pairsam/.pairs output, which are
  # used later for filtering/binning. The default walks-policy is 'mask',
  # which masks complex walks in long reads.
  parsing_options: '--add-columns mapq --walks-policy mask'
# Control how PCR/optical duplicates are detected in the data.
dedup:
  # PCR/optical duplicates are detected as Hi-C pairs with matching locations
  # on both sides. 'max_mismatch_bp' controls the maximal allowed mismatch in
  # mapped locations on either side for two pairs to be still considered as
  # duplicates.
  max_mismatch_bp: 1

  # Pass extra options to pairtools dedup:
  #   --backend    selects the working engine: "scipy", "sklearn" or "cython"
  #                (scipy by default);
  #   --chunksize  applies to the scipy and sklearn backends; it is 100000 by
  #                default, which consumes several Gb per process. Smaller
  #                values (e.g. 10000) increase processing time but decrease
  #                memory usage.
  # Note that on a cluster the job can be "Killed" if the task uses more than
  # the memory limit.
  dedup_options: ''
# Control how Hi-C pairs are binned into contact maps, stored in .cool files.
bin:
  # Specify which resolutions (bin sizes, in bp) should be included in the
  # multi-resolution .cool file.
  # The base resolution (the smallest bin size) _must_ be a common divisor of
  # all other resolutions, i.e. every other resolution must be an integer
  # multiple of it.
  resolutions:
    - 10000000
    - 5000000
    - 2500000
    - 1000000
    - 500000
    - 250000
    - 100000
    - 50000
    - 25000
    - 10000
    - 5000
    - 2000
    - 1000

  # Specify if the multi-resolution .cool output files should be balanced.
  balance: true

  # Pass additional parameters to cooler balance:
  # balance_options: ''

  # Specify additional filters applied to pairs during binning.
  # Multiple filters are allowed; for each filter, all pairs satisfying the
  # given filter expression will be binned into a separate cooler.
  # Filters are specified using the following syntax:
  # {filter_name}: '{a valid Python expression}'
  filters:
    no_filter: ''
    mapq_30: '(mapq1>=30) and (mapq2>=30)'
########################################
# Folder structure for storing results.
########################################
output:
  # One directory per kind of pipeline output; the relative paths below
  # all live under 'results/'.
  dirs:
    processed_fastqs: 'results/processed_fastqs/'
    mapped_parsed_sorted_chunks: 'results/mapped_parsed_sorted_chunks'
    fastqc: 'results/fastqc/'
    pairs_library: 'results/pairs_library'
    coolers_library: 'results/coolers_library/'
    coolers_library_group: 'results/coolers_library_group/'
    stats_library_group: 'results/stats_library_group/'