diff --git a/workflow/rules/hifiasm.smk b/workflow/rules/hifiasm.smk new file mode 100644 index 0000000..12b2cdb --- /dev/null +++ b/workflow/rules/hifiasm.smk @@ -0,0 +1,18 @@ +# This rule uses hifiasm to assemble the genome after filtering out organelles reads +# Following the instructions for assembling heterozygous genomes from the hifiasm github + +rule run_hifiasm: + input: + organelles_filtered = "results/reads/hifi/filtered_orgs_reads.fastq.gz" + output: + gfa = "results/assemblies/hifiasm/hifiasm.asm.bp.p_ctg.gfa", + fasta = "results/assemblies/hifiasm/hifiasm.asm.bp.p_ctg.fa" + conda: + "../envs/hifiasm.yaml" + params: + optional_params = " ".join(f'{k} {v}' for k, v in config['hifiasm']['optional_params'].items() if v) + shell: + """ + hifiasm {input.organelles_filtered} -t {config[hifiasm][t]} -o results/assemblies/hifiasm/hifiasm.asm {params.optional_params} + awk '/^S/{{print ">"$$2;print $$3}}' {output.gfa} > {output.fasta} + """ \ No newline at end of file diff --git a/workflow/rules/minimap2.smk b/workflow/rules/minimap2.smk new file mode 100644 index 0000000..35fdccf --- /dev/null +++ b/workflow/rules/minimap2.smk @@ -0,0 +1,17 @@ +# This rule uses minimap2 to align the fastq reads to the organelles fasta + +rule run_minimap2: + input: + mito = "results/oatk/oatk.asm.mito.ctg.fasta", + pltd = "results/oatk/oatk.asm.pltd.ctg.fasta", + fastq = "results/reads/hifi/hifi.fastq.gz" + output: + mito_sam = "results/minimap2/aln.mito.sam", + pltd_sam = "results/minimap2/aln.pltd.sam", + conda: + "../envs/minimap2.yaml" + shell: + """ + minimap2 -ax map-hifi {input.mito} -t {config[minimap2][t]} {input.fastq} > {output.mito_sam} + minimap2 -ax map-hifi {input.pltd} -t {config[minimap2][t]} {input.fastq} > {output.pltd_sam} + """ \ No newline at end of file diff --git a/workflow/rules/purge_dups.smk b/workflow/rules/purge_dups.smk new file mode 100644 index 0000000..36d66f5 --- /dev/null +++ b/workflow/rules/purge_dups.smk @@ -0,0 +1,30 @@ +# This rule uses purge_dups purge haplotigs and overlaps in the assembly produced by hifiasm + +rule run_purge_dups: + input: + reads = "results/reads/hifi/filtered_orgs_reads.fastq.gz", + fasta = "results/assemblies/hifiasm/hifiasm.asm.bp.p_ctg.fa" + output: + paf = "results/assemblies/purge_dups/hifi_vs_hifiasm_contigs.paf.gz", + pcbstat_stat = "results/assemblies/purge_dups/PB.stat", + pcbstat_cov = "results/assemblies/purge_dups/PB.base.cov", + split_fa = "results/assemblies/purge_dups/hifiasm.asm.split", + self_paf = "results/assemblies/purge_dups/hifiasm.split.self.paf.gz", + dups_bed = "results/assemblies/purge_dups/dups.bed", + log = "results/assemblies/purge_dups/purge_dups.log", + purged_fasta = "results/assemblies/purge_dups/purged.fa", + fasta_rename = "results/assemblies/purge_dups/hifiasm.asm.purged.fa" + conda: + "../envs/purge_dups.yaml" + shell: + """ + minimap2 -xasm20 {input.fasta} {input.reads} -t {config[minimap2][t]} | gzip -c - > {output.paf} + pbcstat {output.paf} + mv PB.* results/assemblies/purge_dups/ + calcutus {output.pcbstat_stat} > cutoffs 2>calcults.log + split_fa {input.fasta} > {output.split_fa} + minimap2 -xasm5 -DP {output.split_fa} {output.split_fa} -t {config[minimap2][t]} | gzip -c - > {output.self_paf} + purge_dups -2 -T cutoffs -c {output.pcbstat_cov} {output.self_paf} > {output.dups_bed} 2> {output.log} + get_seqs -e {output.dups_bed} {input.fasta} > {output.purged_fasta} + mv {output.purged_fasta} {output.fasta_rename} + """ \ No newline at end of file diff --git a/workflow/rules/samtools.smk b/workflow/rules/samtools.smk new file mode 100644 index 0000000..ab80e57 --- /dev/null +++ b/workflow/rules/samtools.smk @@ -0,0 +1,22 @@ +# This rule uses samtools and shell commands to extract the name of the fastq reads aligning to organelles fasta for their subsequent removal + +rule run_samtools: + input: + mito = "results/minimap2/aln.mito.sam", + pltd = "results/minimap2/aln.pltd.sam" + output: + mito_bam = "results/samtools/mito.sorted.bam", + pltd_bam = "results/samtools/pltd.sorted.bam", + mito_reads = "results/samtools/mapped.mito.reads.txt", + pltd_reads = "results/samtools/mapped.pltd.reads.txt", + organelles = "results/samtools/organelles.reads.txt" + conda: + "../envs/samtools.yaml" + shell: + """ + samtools view -Sb {input.mito} | samtools sort -o {output.mito_bam} + samtools view -Sb {input.pltd} | samtools sort -o {output.pltd_bam} + samtools view -F 4 {output.mito_bam} | cut -f1 > {output.mito_reads} + samtools view -F 4 {output.pltd_bam} | cut -f1 > {output.pltd_reads} + cat {output.mito_reads} {output.pltd_reads} | sort | uniq > {output.organelles} + """ \ No newline at end of file diff --git a/workflow/rules/seqkit.smk b/workflow/rules/seqkit.smk new file mode 100644 index 0000000..be6dc64 --- /dev/null +++ b/workflow/rules/seqkit.smk @@ -0,0 +1,14 @@ +# This rule uses seqkit to filter organelle reads from the initial fastq files + +rule run_seqkit: + input: + organelles = "results/samtools/organelles.reads.txt", + fastq = "results/reads/hifi/hifi.fastq.gz" + output: + organelles_filtered = "results/reads/hifi/filtered_orgs_reads.fastq.gz" + conda: + "../envs/seqkit.yaml" + shell: + """ + seqkit grep -f {input.organelles} -v {input.fastq} | gzip > {output.organelles_filtered} + """ \ No newline at end of file