forked from kcajj/recombinant_population_analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
22 changed files
with
600 additions
and
202 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,174 +1,62 @@ | ||
configfile: "config.yml" | ||
import yaml | ||
import pathlib | ||
import numpy as np | ||
import json | ||
import sys | ||
|
||
#reads = 'data/reads_for_test/test_{population}_{timestep}.fastq.bgz' | ||
reads = 'data/reads/{population}_{timestep}.fastq.gz' | ||
references = 'data/references.fasta' | ||
|
||
rule msa: | ||
input: | ||
reference = references | ||
output: | ||
msa = 'results/msa/msa_refs.fasta' | ||
conda: | ||
'conda_envs/read_mapping.yml' | ||
shell: | ||
""" | ||
mafft --auto \ | ||
{input.reference} \ | ||
> {output.msa} | ||
""" | ||
|
||
rule hybrid_ref: | ||
input: | ||
msa = rules.msa.output.msa | ||
output: | ||
hybrid_ref = 'results/msa/hybrid_ref.fasta' | ||
conda: | ||
'conda_envs/scientific_python.yml' | ||
shell: | ||
""" | ||
python scripts/hybrid_reference.py \ | ||
--msa {input.msa} \ | ||
--out {output.hybrid_ref} | ||
""" | ||
|
||
rule read_mapping: | ||
input: | ||
fastq=reads, | ||
ref=rules.hybrid_ref.output.hybrid_ref | ||
output: | ||
sam = 'results/alignments/{population}/{timestep}/{population}_{timestep}.sam', | ||
bam = 'results/alignments/{population}/{timestep}/{population}_{timestep}.bam', | ||
bai = 'results/alignments/{population}/{timestep}/{population}_{timestep}.bam.bai' | ||
conda: | ||
'conda_envs/read_mapping.yml' | ||
shell: | ||
""" | ||
minimap2 -a -x map-ont {input.ref} {input.fastq} > {output.sam} | ||
samtools sort {output.sam} > {output.bam} | ||
samtools index {output.bam} | ||
""" | ||
|
||
rule evidence_arrays: | ||
input: | ||
bam=rules.read_mapping.output.bam, | ||
msa = rules.msa.output.msa | ||
output: | ||
evidences='results/evidence_arrays/{population}/{population}_{timestep}.tsv', | ||
coverage='results/coverage_arrays/{population}/{population}_{timestep}.npz' | ||
conda: | ||
'conda_envs/scientific_python.yml' | ||
params: | ||
length_threshold = config["length_threshold"] | ||
shell: | ||
""" | ||
python scripts/extract_evidence_arrays.py \ | ||
--bam {input.bam} \ | ||
--msa_refs {input.msa} \ | ||
--evidences_out {output.evidences} \ | ||
--coverage_out {output.coverage} \ | ||
--length_threshold {params.length_threshold} | ||
""" | ||
|
||
rule prediction_arrays: | ||
input: | ||
evidences=rules.evidence_arrays.output.evidences, | ||
msa = rules.msa.output.msa | ||
output: | ||
predictions='results/prediction_arrays/{population}/{population}_{timestep}.tsv' | ||
conda: | ||
'conda_envs/scientific_python.yml' | ||
params: | ||
cores = config["cores"], | ||
initial_probability = config["initial_probability"]["A"]+","+config["initial_probability"]["B"], | ||
transition_probability = config["transition_probability"]["A"]["A"]+","+config["transition_probability"]["A"]["B"]+"/"+config["transition_probability"]["B"]["A"]+","+config["transition_probability"]["B"]["B"], | ||
emission_probability = config["emission_probability"]["A"][0]+","+config["emission_probability"]["A"][1]+","+config["emission_probability"]["A"][2]+"/"+config["emission_probability"]["B"][0]+","+config["emission_probability"]["B"][1]+","+config["emission_probability"]["B"][2] | ||
shell: | ||
""" | ||
python scripts/hmm_prediction_arrays.py \ | ||
--evidences {input.evidences} \ | ||
--msa_refs {input.msa} \ | ||
--out {output.predictions} \ | ||
--cores {params.cores} \ | ||
--initial_p {params.initial_probability}\ | ||
--transition_p {params.transition_probability}\ | ||
--emission_p {params.emission_probability} | ||
""" | ||
|
||
rule genomewide_recombination_array: | ||
input: | ||
predictions=rules.prediction_arrays.output.predictions, | ||
msa = rules.msa.output.msa | ||
output: | ||
genomewide_recombination='results/genomewide_recombination/{population}/{population}_{timestep}.npz', | ||
genomewide_recombination_01='results/genomewide_recombination/{population}/{population}_{timestep}_01.npz', | ||
genomewide_recombination_10='results/genomewide_recombination/{population}/{population}_{timestep}_10.npz' | ||
conda: | ||
'conda_envs/scientific_python.yml' | ||
shell: | ||
""" | ||
python scripts/genomewide_recombination.py \ | ||
--predictions {input.predictions} \ | ||
--msa_refs {input.msa} \ | ||
--out {output.genomewide_recombination} \ | ||
--out_01 {output.genomewide_recombination_01} \ | ||
--out_10 {output.genomewide_recombination_10} | ||
""" | ||
|
||
rule plot_recombination_array: | ||
input: | ||
recombination=rules.genomewide_recombination_array.output.genomewide_recombination, | ||
coverage=rules.evidence_arrays.output.coverage | ||
output: | ||
plots='results/plots/{population}/total_recombination/{population}_{timestep}.png' | ||
conda: | ||
'conda_envs/scientific_python.yml' | ||
shell: | ||
""" | ||
python scripts/plot_recombination_array.py \ | ||
--recombination {input.recombination} \ | ||
--coverage {input.coverage} \ | ||
--out {output.plots} | ||
""" | ||
|
||
rule plot_directional_recombination: | ||
input: | ||
recombination_01=rules.genomewide_recombination_array.output.genomewide_recombination_01, | ||
recombination_10=rules.genomewide_recombination_array.output.genomewide_recombination_10, | ||
coverage=rules.evidence_arrays.output.coverage, | ||
msa = rules.msa.output.msa | ||
output: | ||
plots='results/plots/{population}/directional_recombination/{population}_{timestep}.png' | ||
conda: | ||
'conda_envs/scientific_python.yml' | ||
shell: | ||
""" | ||
python scripts/plot_directional_recombination.py \ | ||
--recombination_01 {input.recombination_01} \ | ||
--recombination_10 {input.recombination_10} \ | ||
--coverage {input.coverage} \ | ||
--msa_refs {input.msa} \ | ||
--out {output.plots} | ||
""" | ||
|
||
rule plot_references_coverage: | ||
input: | ||
predictions=rules.prediction_arrays.output.predictions, | ||
msa = rules.msa.output.msa | ||
output: | ||
plots='results/plots/{population}/references_coverage/{population}_{timestep}.png', | ||
conda: | ||
'conda_envs/scientific_python.yml' | ||
shell: | ||
""" | ||
python scripts/plot_references_coverage.py \ | ||
--predictions {input.predictions} \ | ||
--msa_refs {input.msa} \ | ||
--out {output.plots} | ||
""" | ||
# check that config file is provided | ||
def check_configfile(): | ||
try: | ||
msg = f""" | ||
----------- loading config file ----------- | ||
--- run config: | ||
{config["run_config"]} | ||
--- alignments: | ||
{config["alignments"]} | ||
--- HMM_parameters: | ||
{config["HMM_parameters"]} | ||
--- plots | ||
{config["plots"]} | ||
------------------------------------------- | ||
""" | ||
print(msg) | ||
except: | ||
raise Exception( | ||
"config file not specified. Please specify with --configfile flag." | ||
) | ||
|
||
|
||
check_configfile() | ||
|
||
# make create log folder, required for cluster execution | ||
pathlib.Path("log").mkdir(exist_ok=True) | ||
|
||
|
||
# extract pileup config options | ||
run_config = config["run_config"] | ||
|
||
in_fld = run_config["input"].removesuffix("/") | ||
out_fld = run_config["output"].removesuffix("/") | ||
HMM = run_config["HMM"] | ||
|
||
# print run options | ||
print("----------- run configuration ------------") | ||
print("input folder:", in_fld) | ||
print("\noutput folder:", out_fld) | ||
print("\nHMM:", json.dumps(HMM, indent=2)) | ||
print("------------------------------------------") | ||
|
||
|
||
include: "rules/HMM.smk" | ||
include: "rules/plots.smk" | ||
|
||
|
||
rule all: | ||
input: | ||
plots=expand(rules.plot_recombination_array.output.plots, population=["P2","P3"], timestep=["1","3","5","7"]), | ||
directional_plots=expand(rules.plot_directional_recombination.output.plots, population=["P2","P3"], timestep=["1","3","5","7"]), | ||
coverage_plots=expand(rules.plot_references_coverage.output.plots, population=["P2","P3"], timestep=["1","3","5","7"]) | ||
rules.HMM_all.input, | ||
rules.plot_all.input |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
name: scientific_python | ||
name: sci_py | ||
channels: | ||
- bioconda | ||
- conda-forge | ||
|
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# General version of the pipeline | ||
|
||
I am still working on the pipeline because we will publish a paper | ||
|
||
## Modifications | ||
|
||
- creation of a proper run_config file that can accept various configurations and experimental setting. I will use as a template the run_config file of evo-genome-analysis pipeline from Marco Molari. | ||
|
||
- implementation of time dynamics plots directly in the pipeline. this can be done by splitting the pipeline. | ||
|
||
- implementation of a coverage threshold on the plots, we don't want to plot the normalised HMM prediction if there are few reads. | ||
|
||
- implementation of a system to compress and decompress arrays. this will make the pipeline more space efficient. | ||
|
||
- prettyfying the pipeline to make it usable from more people, hiding thesis-specific stuff and exploratory scripts. |
Oops, something went wrong.