Skip to content

Commit

Permalink
updated until purge_dups
Browse files Browse the repository at this point in the history
  • Loading branch information
LiaOb21 committed Dec 14, 2023
1 parent b73cf19 commit 346d756
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 0 deletions.
18 changes: 18 additions & 0 deletions workflow/rules/hifiasm.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# This rule uses hifiasm to assemble the genome after filtering out organelles reads
# Following the instructions for assembling heterozygous genomes from the hifiasm github

rule run_hifiasm:
input:
organelles_filtered = "results/reads/hifi/filtered_orgs_reads.fastq.gz"
output:
gfa = "results/assemblies/hifiasm/hifiasm.asm.bp.p_ctg.gfa",
fasta = "results/assemblies/hifiasm/hifiasm.asm.bp.p_ctg.fa"
conda:
"../envs/hifiasm.yaml"
params:
optional_params = " ".join(f'{k} {v}' for k, v in config['hifiasm']['optional_params'].items() if v)
shell:
"""
hifiasm {input.organelles_filtered} -t {config[hifiasm][t]} -o results/assemblies/hifiasm/hifiasm.asm {params.optional_params}
awk '/^S/{{print ">"$$2;print $$3}}' {output.gfa} > {output.fasta}
"""
17 changes: 17 additions & 0 deletions workflow/rules/minimap2.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# This rule uses minimap2 to align the fastq reads to the organelles fasta

rule run_minimap2:
input:
mito = "results/oatk/oatk.asm.mito.ctg.fasta",
pltd = "results/oatk/oatk.asm.pltd.ctg.fasta",
fastq = "results/reads/hifi/hifi.fastq.gz"
output:
mito_sam = "results/minimap2/aln.mito.sam",
pltd_sam = "results/minimap2/aln.pltd.sam",
conda:
"../envs/minimap2.yaml"
shell:
"""
minimap2 -ax map-hifi {input.mito} -t {config[minimap2][t]} {input.fastq} > {output.mito_sam}
minimap2 -ax map-hifi {input.pltd} -t {config[minimap2][t]} {input.fastq} > {output.pltd_sam}
"""
30 changes: 30 additions & 0 deletions workflow/rules/purge_dups.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# This rule uses purge_dups purge haplotigs and overlaps in the assembly produced by hifiasm

rule run_purge_dups:
input:
reads = "results/reads/hifi/filtered_orgs_reads.fastq.gz",
fasta = "results/assemblies/hifiasm/hifiasm.asm.bp.p_ctg.fa"
output:
paf = "results/assemblies/purge_dups/hifi_vs_hifiasm_contigs.paf.gz",
pcbstat_stat = "results/assemblies/purge_dups/PB.stat",
pcbstat_cov = "results/assemblies/purge_dups/PB.base.cov",
split_fa = "results/assemblies/purge_dups/hifiasm.asm.split",
self_paf = "results/assemblies/purge_dups/hifiasm.split.self.paf.gz",
dups_bed = "results/assemblies/purge_dups/dups.bed",
log = "results/assemblies/purge_dups/purge_dups.log",
purged_fasta = "results/assemblies/purge_dups/purged.fa",
fasta_rename = "results/assemblies/purge_dups/hifiasm.asm.purged.fa"
conda:
"../envs/purge_dups.yaml"
shell:
"""
minimap2 -xasm20 {input.fasta} {input.reads} -t {config[minimap2][t]} | gzip -c - > {output.paf}
pbcstat {output.paf}
mv PB.* results/assemblies/purge_dups/
calcutus {output.pcbstat_stat} > cutoffs 2>calcults.log
split_fa {input.fasta} > {output.split_fa}
minimap2 -xasm5 -DP {output.split_fa} {output.split_fa} -t {config[minimap2][t]} | gzip -c - > {output.self_paf}
purge_dups -2 -T cutoffs -c {output.pcbstat_cov} {output.self_paf} > {output.dups_bed} 2> {output.log}
get_seqs -e {output.dups_bed} {input.fasta} > {output.purged_fasta}
mv {output.purged_fasta} {output.fasta_rename}
"""
22 changes: 22 additions & 0 deletions workflow/rules/samtools.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# This rule uses samtools and shell commands to extract the name of the fastq reads aligning to organelles fasta for their subsequent removal

rule run_samtools:
input:
mito = "results/minimap2/aln.mito.sam",
pltd = "results/minimap2/aln.pltd.sam"
output:
mito_bam = "results/samtools/mito.sorted.bam",
pltd_bam = "results/samtools/pltd.sorted.bam",
mito_reads = "results/samtools/mapped.mito.reads.txt",
pltd_reads = "results/samtools/mapped.pltd.reads.txt",
organelles = "results/samtools/organelles.reads.txt"
conda:
"../envs/samtools.yaml"
shell:
"""
samtools view -Sb {input.mito} | samtools sort -o {output.mito_bam}
samtools view -Sb {input.pltd} | samtools sort -o {output.pltd_bam}
samtools view -F 4 {output.mito_bam} | cut -f1 > {output.mito_reads}
samtools view -F 4 {output.pltd_bam} | cut -f1 > {output.pltd_reads}
cat {output.mito_reads} {output.pltd_reads} | sort | uniq > {output.organelles}
"""
14 changes: 14 additions & 0 deletions workflow/rules/seqkit.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# This rule uses seqkit to filter organelle reads from the initial fastq files

rule run_seqkit:
input:
organelles = "results/samtools/organelles.reads.txt",
fastq = "results/reads/hifi/hifi.fastq.gz"
output:
organelles_filtered = "results/reads/hifi/filtered_orgs_reads.fastq.gz"
conda:
"../envs/seqkit.yaml"
shell:
"""
seqkit grep -f {input.organelles} -v {input.fastq} | gzip > {output.organelles_filtered}
"""

0 comments on commit 346d756

Please sign in to comment.