#work_assessment-processing_gtfs_part-0_Trinity-etc.md
Table of contents
Code: Get situated
#!/bin/bash
# tmux new -s gff3
# tmux attach -t gff3
grabnode # 1, defaults
run=TRUE
# run=FALSE
[[ "${run}" == TRUE ]] &&
{
transcriptome &&
{
cd "results/2023-0215" \
|| echo "cd'ing failed; check on this..."
}
if [[ "${CONDA_DEFAULT_ENV}" != "base" ]]; then
conda deactivate
fi
source activate gff3_env
}
Code: Copy in files of interest
#!/bin/bash
# run=TRUE # Only need to run the first time
run=FALSE
[[ "${run}" == TRUE ]] &&
{
# Already ----------------------------
p_gen="${HOME}/genomes"
p_gtf="${HOME}/tsukiyamalab/kalavatt/2022-2023_RRP6-NAB3/results/2023-0215/infiles_gtf-gff3/already"
if [[ ! -d "${p_gtf}" ]]; then mkdir -p "${p_gtf}"; fi
# Check that files exist with the given paths
., "${p_gen}/combined_AG/gtf/combined_AG.gtf"
., "${p_gen}/combined_SC_KL_20S/gff3/combined_SC_KL_20S.gff3"
., "${p_gen}/combined_SC_KL_20S/gff3/combined_SC_KL.gff3"
., "${p_gen}/kluyveromyces_lactis_gca_000002515/Ensembl/55/gff3/Kluyveromyces_lactis_gca_000002515.ASM251v1.55.gff3"
., "${p_gen}/sacCer3/Ensembl/108/gtf/Saccharomyces_cerevisiae.R64-1-1.108.gtf"
., "${p_gen}/sacCer3/Ensembl/108/gtf/Saccharomyces_cerevisiae.R64-1-1.108.plus-chr-rename.gtf"
# Copy in necessary files
if [[ ! -f "${p_gtf}/combined_AG.gtf" ]]; then
cp \
"${p_gen}/combined_AG/gtf/combined_AG.gtf" \
"${p_gtf}/combined_AG.gtf"
fi
if [[ ! -f "${p_gtf}/combined_SC_KL_20S.gff3" ]]; then
cp \
"${p_gen}/combined_SC_KL_20S/gff3/combined_SC_KL_20S.gff3" \
"${p_gtf}/combined_SC_KL_20S.gff3"
fi
if [[ ! -f "${p_gtf}/combined_SC_KL.gff3" ]]; then
cp \
"${p_gen}/combined_SC_KL_20S/gff3/combined_SC_KL.gff3" \
"${p_gtf}/combined_SC_KL.gff3"
fi
if [[ ! -f "${p_gtf}/Kluyveromyces_lactis_gca_000002515.ASM251v1.55.gff3" ]]; then
cp \
"${p_gen}/kluyveromyces_lactis_gca_000002515/Ensembl/55/gff3/Kluyveromyces_lactis_gca_000002515.ASM251v1.55.gff3" \
"${p_gtf}/Kluyveromyces_lactis_gca_000002515.ASM251v1.55.gff3"
fi
if [[ ! -f "${p_gtf}/Saccharomyces_cerevisiae.R64-1-1.108.gtf" ]]; then
cp \
"${p_gen}/sacCer3/Ensembl/108/gtf/Saccharomyces_cerevisiae.R64-1-1.108.gtf" \
"${p_gtf}/Saccharomyces_cerevisiae.R64-1-1.108.gtf"
fi
if [[ ! -f "${p_gtf}/Saccharomyces_cerevisiae.R64-1-1.108.plus-chr-rename.gtf" ]]; then
cp \
"${p_gen}/sacCer3/Ensembl/108/gtf/Saccharomyces_cerevisiae.R64-1-1.108.plus-chr-rename.gtf" \
"${p_gtf}/Saccharomyces_cerevisiae.R64-1-1.108.plus-chr-rename.gtf"
fi
# Trinity-GG -------------------------
p_trinity="${HOME}/tsukiyamalab/kalavatt/2022-2023_RRP6-NAB3/results/2023-0111/outfiles_GMAP_rough-draft/Trinity-GG"
p_gtf="${HOME}/tsukiyamalab/kalavatt/2022-2023_RRP6-NAB3/results/2023-0215/infiles_gtf-gff3"
if [[ ! -d "${p_gtf}/Trinity-GG" ]]; then
echo " No \${p_gtf}/Trinity-GG... Copying it in now"
cp -r "${p_trinity}" "${p_gtf}"
fi
}
Code: Make associated storage directories
#!/bin/bash
# run=TRUE # Only need to run the first time
run=FALSE
[[ "${run}" == TRUE ]] &&
{
mkdir -p outfiles_gtf-gff3/{already,Trinity-GG}
mkdir -p outfiles_gtf-gff3/Trinity-GG/{G_N,Q_N}/err_out
mkdir -p outfiles_htseq-count/{already,Trinity-GG}
mkdir -p outfiles_htseq-count/Trinity-GG/{G_N,Q_N}/{sh,list,err_out}
}
Code: Create arrays of relevant gff3 and bam files
#!/bin/bash
run=TRUE
# run=FALSE
[[ "${run}" == TRUE ]] &&
{
unset stems
typeset -a stems
while IFS=" " read -r -d $'\0'; do
stems+=( "${REPLY%.gff3}" )
done < <(\
find "infiles_gtf-gff3" \
-type f \
-name "trinity*.gff3" \
-print0 \
| sort -z\
)
echo_test "${stems[@]}"
echo "${#stems[@]}" && echo "" # 12
unset bams
typeset -a bams
while IFS=" " read -r -d $'\0'; do
bams+=( "${REPLY}" )
done < <(\
find "bams_renamed/UT_prim_UMI" \
-type l \
-name "*ovn*bam" \
-print0 \
| sort -z \
)
echo_test "${bams[@]}"
echo "${#bams[@]}" && echo "" # 8
}
Code: Run GffRead
#!/bin/bash
# Echo test of GffRead -------------------------------------------------------
run=TRUE
# run=FALSE
[[ "${run}" == TRUE ]] &&
{
h=0
for i in "${stems[@]}"; do
# i="${stems[10]}" # echo "${i}"
in="${i}.gff3" # echo "${in}"
out="$(echo "${i}" | sed 's/infiles/outfiles/g;s/\\.gff3//g' -).gffread.gff3" # echo "${out}"
err_out="$(dirname "${out}")/err_out/02-gffread.$(basename "${out}" .gff3)" # echo "${err_out}"
fasta_g="${HOME}/genomes/combined_SC_KL_20S/fasta/fasta_individual/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.chr-rename.fasta" # ., "${fasta_g}"
let h++
echo " # ==============================================="
printf " Iteration '%d'\n\n" "${h}"
echo """
---------------
Running gffread
---------------
in ${in}
out (base) ${out}
stdout ${err_out}.stdout.txt
stderr ${err_out}.stderr.txt
---------------------
Call to gffread: Base
---------------------
gffread \\
-v \\
-g ${fasta} \\
-i 1000 \\
-Z \\
-M -K -Q \\
-F -N -P \\
--force-exons --gene2exon \\
-o ${out} \\
<(awk -F '\t' 'BEGIN {OFS = FS} { gsub(/chr/, \"\", \$1); gsub(/M/, \"Mito\", \$1); print }' "${in}") \\
> >(tee -a ${err_out%.}.stdout.txt) \\
2> >(tee -a ${err_out%.}.stderr.txt)
--------------------------------------
Call to gffread: Intron-filtering only
--------------------------------------
gffread \\
-v -O \\
-i 1000 \\
-o ${out/.gff3/-intron-filtering-only.gff3} \\
<(awk -F '\t' 'BEGIN {OFS = FS} { gsub(/chr/, \"\", \$1); gsub(/M/, \"Mito\", \$1); print }' "${in}") \\
> >(tee -a ${err_out%.}-intron-filtering-only.stdout.txt) \\
2> >(tee -a ${err_out%.}-intron-filtering-only.stderr.txt)
----------------------------
Call to gffread: Coding only
----------------------------
gffread \\
-v \\
-g ${fasta} \\
-C -i 1000 \\
-Z \\
-M -K -Q \\
-F -N -P \\
--force-exons --gene2exon \\
-o ${out/.gff3/-coding.gff3} \\
<(awk -F '\t' 'BEGIN {OFS = FS} { gsub(/chr/, \"\", \$1); gsub(/M/, \"Mito\", \$1); print }' "${in}") \\
> >(tee -a ${err_out%.}-coding.stdout.txt) \\
2> >(tee -a ${err_out%.}-coding.stderr.txt)
--------------------------------
Call to gffread: Non-coding only
--------------------------------
gffread \\
-v \\
-g ${fasta} \\
--nc -i 1000 \\
-Z \\
-M -K -Q \\
-F -N -P \\
--force-exons --gene2exon \\
-o ${out/.gff3/-non-coding.gff3} \\
<(awk -F '\t' 'BEGIN {OFS = FS} { gsub(/chr/, \"\", \$1); gsub(/M/, \"Mito\", \$1); print }' "${in}") \\
> >(tee -a ${err_out%.}-non-coding.stdout.txt) \\
2> >(tee -a ${err_out%.}-non-coding.stderr.txt)
"""
done
}
# Running GffRead ------------------------------------------------------------
# run=TRUE
run=FALSE
[[ "${run}" == TRUE ]] &&
{
h=0
for i in "${stems[@]}"; do
# i="${stems[10]}" # echo "${i}"
in="${i}.gff3" # echo "${in}"
out="$(echo "${i}" | sed 's/infiles/outfiles/g;s/\\.gff3//g' -).gffread.gff3" # echo "${out}"
err_out="$(dirname "${out}")/err_out/02-gffread.$(basename "${out}" .gff3)" # echo "${err_out}"
fasta_g="${HOME}/genomes/combined_SC_KL_20S/fasta/fasta_individual/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.chr-rename.fasta" # ., "${fasta_g}"
let h++
echo " # ==============================================="
printf " Iteration '%d'\n\n" "${h}"
echo """
---------------
Running gffread
---------------
in ${in}
out (base) ${out}
stdout ${err_out}.stdout.txt
stderr ${err_out}.stderr.txt
---------------------
Call to gffread: Base
---------------------
gffread \\
-v \\
-g ${fasta} \\
-i 1000 \\
-Z \\
-M -K -Q \\
-F -N -P \\
--force-exons --gene2exon \\
-o ${out} \\
<(awk -F '\t' 'BEGIN {OFS = FS} { gsub(/chr/, \"\", \$1); gsub(/M/, \"Mito\", \$1); print }' "${in}") \\
> >(tee -a ${err_out%.}.stdout.txt) \\
2> >(tee -a ${err_out%.}.stderr.txt)
--------------------------------------
Call to gffread: Intron-filtering only
--------------------------------------
gffread \\
-v -O \\
-i 1000 \\
-o ${out/.gff3/-intron-filtering-only.gff3} \\
<(awk -F '\t' 'BEGIN {OFS = FS} { gsub(/chr/, \"\", \$1); gsub(/M/, \"Mito\", \$1); print }' "${in}") \\
> >(tee -a ${err_out%.}-intron-filtering-only.stdout.txt) \\
2> >(tee -a ${err_out%.}-intron-filtering-only.stderr.txt)
----------------------------
Call to gffread: Coding only
----------------------------
gffread \\
-v \\
-g ${fasta} \\
-C -i 1000 \\
-Z \\
-M -K -Q \\
-F -N -P \\
--force-exons --gene2exon \\
-o ${out/.gff3/-coding.gff3} \\
<(awk -F '\t' 'BEGIN {OFS = FS} { gsub(/chr/, \"\", \$1); gsub(/M/, \"Mito\", \$1); print }' "${in}") \\
> >(tee -a ${err_out%.}-coding.stdout.txt) \\
2> >(tee -a ${err_out%.}-coding.stderr.txt)
--------------------------------
Call to gffread: Non-coding only
--------------------------------
gffread \\
-v \\
-g ${fasta} \\
--nc -i 1000 \\
-Z \\
-M -K -Q \\
-F -N -P \\
--force-exons --gene2exon \\
-o ${out/.gff3/-non-coding.gff3} \\
<(awk -F '\t' 'BEGIN {OFS = FS} { gsub(/chr/, \"\", \$1); gsub(/M/, \"Mito\", \$1); print }' "${in}") \\
> >(tee -a ${err_out%.}-non-coding.stdout.txt) \\
2> >(tee -a ${err_out%.}-non-coding.stderr.txt)
"""
# Base
gffread \
-v \
-g "${fasta_g}" \
-i 1000 \
-Z \
-M -K -Q \
-F -N -P \
--force-exons --gene2exon \
-o "${out}" \
<(awk -F '\t' 'BEGIN {OFS = FS} { gsub(/chr/, "", $1); gsub(/M/, "Mito", $1); print }' "${in}") \
> >(tee -a "${err_out%.}.stdout.txt") \
2> >(tee -a "${err_out%.}.stderr.txt")
# Intron-filtering only
gffread \
-v -O \
-i 1000 \
-o "${out/.gff3/-intron-filtering-only.gff3}" \
<(awk -F '\t' 'BEGIN {OFS = FS} { gsub(/chr/, "", $1); gsub(/M/, "Mito", $1); print }' "${in}") \
> >(tee -a "${err_out%.}-intron-filtering-only.stdout.txt") \
2> >(tee -a "${err_out%.}-intron-filtering-only.stderr.txt")
# Coding only
gffread \
-v \
-g "${fasta_g}" \
-C -i 1000 \
-Z \
-M -K -Q \
-F -N -P \
--force-exons --gene2exon \
-o "${out/.gff3/-coding.gff3}" \
<(awk -F '\t' 'BEGIN {OFS = FS} { gsub(/chr/, "", $1); gsub(/M/, "Mito", $1); print }' "${in}") \
> >(tee -a "${err_out%.}-coding.stdout.txt") \
2> >(tee -a "${err_out%.}-coding.stderr.txt")
# Non-coding only
gffread \
-v \
-g "${fasta_g}" \
--nc -i 1000 \
-Z \
-M -K -Q \
-F -N -P \
--force-exons --gene2exon \
-o "${out/.gff3/-non-coding.gff3}" \
<(awk -F '\t' 'BEGIN {OFS = FS} { gsub(/chr/, "", $1); gsub(/M/, "Mito", $1); print }' "${in}") \
> >(tee -a "${err_out%.}-non-coding.stdout.txt") \
2> >(tee -a "${err_out%.}-non-coding.stderr.txt")
done
}
# find ./outfiles_gtf-gff3 -type f -name "*gffread*" -delete
# find ./outfiles_gtf-gff3 -type f -name "*intron-filtering-only*" -delete
...from "base" calls to gffread
Code: Call htseq-count with respect to --type locus, mRNA, exon, and CDS
#!/bin/bash
# Echo tests for calls to htseq-count ----------------------------------------
run=TRUE
# run=FALSE
[[ "${run}" == TRUE ]] &&
{
h=0
# for i in "strd-eq" "strd-rv"; do
for i in "strd-eq"; do
for j in "${stems[@]}"; do
# i="strd-eq" # echo "${i}"
# j="${stems[7]}" # echo "${j}"
in="$(echo "${j}" | sed 's/infiles/outfiles/g' - ).gffread.gff3" # ., "${in}" # less "${in}" # Approach #2
out="$(echo "${j}" | sed 's/infiles_gtf-gff3/outfiles_htseq-count/g' - ).hc-${i}.tsv" # echo "${out}"
err_out="$(dirname "${out}")/err_out/03-htseq-count-${i}.$(basename "${out}" .tsv)" # echo "${err_out}"
if [[ "${i}" == "strd-eq" ]]; then
hc_strd="yes"
elif [[ "${i}" == "strd-rv" ]]; then
hc_strd="reverse"
fi
let h++
echo " # ==============================================="
printf " Iteration '%d'\n\n" "${h}"
echo """
--------------------------
Call to htseq-count: locus
--------------------------
sbatch \\
--job-name=\"htseq-count-locus\" \\
--nodes=1 \\
--cpus-per-task=8 \\
--error=\"${err_out}-locus.%A.stderr.txt\" \\
--output=\"${err_out}-locus.%A.stdout.txt\" \\
htseq-count \\
--order \"pos\" \\
--stranded \"${hc_strd}\" \\
--nonunique \"all\" \\
--type \"locus\" \\
--idattr \"ID\" \\
--nprocesses 8 \\
--counts_output \"${out/.tsv/-locus.tsv}\" \\
--with-header \\
\${bams[*]} \\
\"${in}\"
-------------------------
Call to htseq-count: mRNA
-------------------------
sbatch \\
--job-name=\"htseq-count-mRNA\" \\
--nodes=1 \\
--cpus-per-task=8 \\
--error=\"${err_out}-mRNA.%A.stderr.txt\" \\
--output=\"${err_out}-mRNA.%A.stdout.txt\" \\
htseq-count \\
--order \"pos\" \\
--stranded \"${hc_strd}\" \\
--nonunique \"all\" \\
--type \"mRNA\" \\
--idattr \"ID\" \\
--nprocesses 8 \\
--counts_output \"${out/.tsv/-mRNA.tsv}\" \\
--with-header \\
\${bams[*]} \\
\"${in}\"
-------------------------
Call to htseq-count: exon
-------------------------
sbatch \\
--job-name=\"htseq-count-exon\" \\
--nodes=1 \\
--cpus-per-task=8 \\
--error=\"${err_out}-exon.%A.stderr.txt\" \\
--output=\"${err_out}-exon.%A.stdout.txt\" \\
htseq-count \\
--order \"pos\" \\
--stranded \"${hc_strd}\" \\
--nonunique \"all\" \\
--type \"exon\" \\
--idattr \"Parent\" \\
--nprocesses 8 \\
--counts_output \"${out/.tsv/-exon.tsv}\" \\
--with-header \\
\${bams[*]} \\
\"${in}\"
------------------------
Call to htseq-count: CDS
------------------------
sbatch \\
--job-name=\"htseq-count-CDS\" \\
--nodes=1 \\
--cpus-per-task=8 \\
--error=\"${err_out}-CDS.%A.stderr.txt\" \\
--output=\"${err_out}-CDS.%A.stdout.txt\" \\
htseq-count \\
--order \"pos\" \\
--stranded \"${hc_strd}\" \\
--nonunique \"all\" \\
--type \"CDS\" \\
--idattr \"Parent\" \\
--nprocesses 8 \\
--counts_output \"${out/.tsv/-CDS.tsv}\" \\
--with-header \\
\${bams[*]} \\
\"${in}\"
"""
done
done
}
# Calls to htseq-count -------------------------------------------------------
# run=TRUE
run=FALSE
[[ "${run}" == TRUE ]] &&
{
h=0
# for i in "strd-eq" "strd-rv"; do
for i in "strd-eq"; do
for j in "${stems[@]}"; do
# i="strd-eq" # echo "${i}"
# j="${stems[7]}" # echo "${j}"
in="$(echo "${j}" | sed 's/infiles/outfiles/g' - ).gffread.gff3" # ., "${in}" # less "${in}" # Approach #2
out="$(echo "${j}" | sed 's/infiles_gtf-gff3/outfiles_htseq-count/g' - ).hc-${i}.tsv" # echo "${out}"
err_out="$(dirname "${out}")/err_out/03-htseq-count-${i}.$(basename "${out}" .tsv)" # echo "${err_out}"
if [[ "${i}" == "strd-eq" ]]; then
hc_strd="yes"
elif [[ "${i}" == "strd-rv" ]]; then
hc_strd="reverse"
fi
let h++
echo " # ==============================================="
printf " Iteration '%d'\n\n" "${h}"
echo """
--------------------------
Call to htseq-count: locus
--------------------------
sbatch \\
--job-name=\"htseq-count-locus\" \\
--nodes=1 \\
--cpus-per-task=8 \\
--error=\"${err_out}-locus.%A.stderr.txt\" \\
--output=\"${err_out}-locus.%A.stdout.txt\" \\
htseq-count \\
--order \"pos\" \\
--stranded \"${hc_strd}\" \\
--nonunique \"all\" \\
--type \"locus\" \\
--idattr \"ID\" \\
--nprocesses 8 \\
--counts_output \"${out/.tsv/-locus.tsv}\" \\
--with-header \\
\${bams[*]} \\
\"${in}\"
-------------------------
Call to htseq-count: mRNA
-------------------------
sbatch \\
--job-name=\"htseq-count-mRNA\" \\
--nodes=1 \\
--cpus-per-task=8 \\
--error=\"${err_out}-mRNA.%A.stderr.txt\" \\
--output=\"${err_out}-mRNA.%A.stdout.txt\" \\
htseq-count \\
--order \"pos\" \\
--stranded \"${hc_strd}\" \\
--nonunique \"all\" \\
--type \"mRNA\" \\
--idattr \"ID\" \\
--nprocesses 8 \\
--counts_output \"${out/.tsv/-mRNA.tsv}\" \\
--with-header \\
\${bams[*]} \\
\"${in}\"
-------------------------
Call to htseq-count: exon
-------------------------
sbatch \\
--job-name=\"htseq-count-exon\" \\
--nodes=1 \\
--cpus-per-task=8 \\
--error=\"${err_out}-exon.%A.stderr.txt\" \\
--output=\"${err_out}-exon.%A.stdout.txt\" \\
htseq-count \\
--order \"pos\" \\
--stranded \"${hc_strd}\" \\
--nonunique \"all\" \\
--type \"exon\" \\
--idattr \"Parent\" \\
--nprocesses 8 \\
--counts_output \"${out/.tsv/-exon.tsv}\" \\
--with-header \\
\${bams[*]} \\
\"${in}\"
------------------------
Call to htseq-count: CDS
------------------------
sbatch \\
--job-name=\"htseq-count-CDS\" \\
--nodes=1 \\
--cpus-per-task=8 \\
--error=\"${err_out}-CDS.%A.stderr.txt\" \\
--output=\"${err_out}-CDS.%A.stdout.txt\" \\
htseq-count \\
--order \"pos\" \\
--stranded \"${hc_strd}\" \\
--nonunique \"all\" \\
--type \"CDS\" \\
--idattr \"Parent\" \\
--nprocesses 8 \\
--counts_output \"${out/.tsv/-CDS.tsv}\" \\
--with-header \\
\${bams[*]} \\
\"${in}\"
"""
sbatch \
--job-name="htseq-count-locus" \
--nodes=1 \
--cpus-per-task=8 \
--error="${err_out}-locus.%A.stderr.txt" \
--output="${err_out}-locus.%A.stdout.txt" \
htseq-count \
--order "pos" \
--stranded "${hc_strd}" \
--nonunique "all" \
--type "locus" \
--idattr "ID" \
--nprocesses 8 \
--counts_output "${out/.tsv/-locus.tsv}" \
--with-header \
${bams[*]} \
"${in}"
sleep 0.33
sbatch \
--job-name="htseq-count-mRNA" \
--nodes=1 \
--cpus-per-task=8 \
--error="${err_out}-mRNA.%A.stderr.txt" \
--output="${err_out}-mRNA.%A.stdout.txt" \
htseq-count \
--order "pos" \
--stranded "${hc_strd}" \
--nonunique "all" \
--type "mRNA" \
--idattr "ID" \
--nprocesses 8 \
--counts_output "${out/.tsv/-mRNA.tsv}" \
--with-header \
${bams[*]} \
"${in}"
sleep 0.33
sbatch \
--job-name="htseq-count-exon" \
--nodes=1 \
--cpus-per-task=8 \
--error="${err_out}-exon.%A.stderr.txt" \
--output="${err_out}-exon.%A.stdout.txt" \
htseq-count \
--order "pos" \
--stranded "${hc_strd}" \
--nonunique "all" \
--type "exon" \
--idattr "Parent" \
--nprocesses 8 \
--counts_output "${out/.tsv/-exon.tsv}" \
--with-header \
${bams[*]} \
"${in}"
sleep 0.33
sbatch \
--job-name="htseq-count-CDS" \
--nodes=1 \
--cpus-per-task=8 \
--error="${err_out}-CDS.%A.stderr.txt" \
--output="${err_out}-CDS.%A.stdout.txt" \
htseq-count \
--order "pos" \
--stranded "${hc_strd}" \
--nonunique "all" \
--type "CDS" \
--idattr "Parent" \
--nprocesses 8 \
--counts_output "${out/.tsv/-CDS.tsv}" \
--with-header \
${bams[*]} \
"${in}"
sleep 0.33
done
done
}
Code: Call htseq-count with respect to the intron-filtering-only gffread files
#!/bin/bash
# Echo tests for calls to htseq-count ----------------------------------------
run=TRUE
# run=FALSE
[[ "${run}" == TRUE ]] &&
{
h=0
# for i in "strd-eq" "strd-rv"; do
for i in "strd-eq"; do
for j in "${stems[@]}"; do
# i="strd-eq" # echo "${i}"
# j="${stems[7]}" # echo "${j}"
in="$(echo "${j}" | sed 's/infiles/outfiles/g' - ).gffread-intron-filtering-only.gff3" # ., "${in}" # less "${in}"
out="$(echo "${j}" | sed 's/infiles_gtf-gff3/outfiles_htseq-count/g' - ).gffread-intron-filtering-only.hc-${i}.tsv" # echo "${out}"
err_out="$(dirname "${out}")/err_out/03-htseq-count-${i}.$(basename "${out}" .tsv)" # echo "${err_out}"
if [[ "${i}" == "strd-eq" ]]; then
hc_strd="yes"
elif [[ "${i}" == "strd-rv" ]]; then
hc_strd="reverse"
fi
let h++
printf " Iteration '%d'\n\n" "${h}"
echo """
-------------------------
Call to htseq-count: gene
-------------------------
sbatch \\
--job-name=\"htseq-count-gene\" \\
--nodes=1 \\
--cpus-per-task=8 \\
--error=\"${err_out}-gene.%A.stderr.txt\" \\
--output=\"${err_out}-gene.%A.stdout.txt\" \\
htseq-count \\
--order \"pos\" \\
--stranded \"${hc_strd}\" \\
--nonunique \"all\" \\
--type \"gene\" \\
--idattr \"ID\" \\
--nprocesses 8 \\
--counts_output \"${out/.tsv/-gene.tsv}\" \\
--with-header \\
\${bams[*]} \\
\"${in}\"
"""
done
done
}
# Calls to htseq-count -------------------------------------------------------
# run=TRUE
run=FALSE
[[ "${run}" == TRUE ]] &&
{
h=0
# for i in "strd-eq" "strd-rv"; do
for i in "strd-eq"; do
for j in "${stems[@]}"; do
# i="strd-eq" # echo "${i}"
# j="${stems[7]}" # echo "${j}"
in="$(echo "${j}" | sed 's/infiles/outfiles/g' - ).gffread-intron-filtering-only.gff3" # ., "${in}" # less "${in}"
out="$(echo "${j}" | sed 's/infiles_gtf-gff3/outfiles_htseq-count/g' - ).gffread-intron-filtering-only.hc-${i}.tsv" # echo "${out}"
err_out="$(dirname "${out}")/err_out/03-htseq-count-${i}.$(basename "${out}" .tsv)" # echo "${err_out}"
if [[ "${i}" == "strd-eq" ]]; then
hc_strd="yes"
elif [[ "${i}" == "strd-rv" ]]; then
hc_strd="reverse"
fi
let h++
printf " Iteration '%d'\n\n" "${h}"
echo """
-------------------------
Call to htseq-count: gene
-------------------------
sbatch \\
--job-name=\"htseq-count-gene\" \\
--nodes=1 \\
--cpus-per-task=8 \\
--error=\"${err_out}-gene.%A.stderr.txt\" \\
--output=\"${err_out}-gene.%A.stdout.txt\" \\
htseq-count \\
--order \"pos\" \\
--stranded \"${hc_strd}\" \\
--nonunique \"all\" \\
--type \"gene\" \\
--idattr \"ID\" \\
--nprocesses 8 \\
--counts_output \"${out/.tsv/-gene.tsv}\" \\
--with-header \\
\${bams[*]} \\
\"${in}\"
"""
sbatch \
--job-name="htseq-count-gene" \
--nodes=1 \
--cpus-per-task=8 \
--error="${err_out}-gene.%A.stderr.txt" \
--output="${err_out}-gene.%A.stdout.txt" \
htseq-count \
--order "pos" \
--stranded "${hc_strd}" \
--nonunique "all" \
--type "gene" \
--idattr "ID" \
--nprocesses 8 \
--counts_output "${out/.tsv/-gene.tsv}" \
--with-header \
${bams[*]} \
"${in}"
sleep 0.33
done
done
}
Notes: Correct tsv outfiles with formatting errors
For some reason, these files had formatting errors, so rerunning the htseq-count
jobs in the hopes that it solves the problems:
mRNA
G_N/trinity-gg_mkc-16_mir-0.05_mg-2_gf-0.05.hc-strd-eq-mRNA.tsv
Q_N/trinity-gg_mkc-2_mir-0.05_mg-2_gf-0.05.hc-strd-eq-mRNA.tsv
CDS
Q_N/trinity-gg_mkc-16_mir-0.05_mg-2_gf-0.05.hc-strd-eq-CDS.tsv
exon
Q_N/trinity-gg_mkc-16_mir-0.05_mg-2_gf-0.05.hc-strd-eq-exon.tsv
Q_N/trinity-gg_mkc-32_mir-0.05_mg-2_gf-0.05.hc-strd-eq-exon.tsv
Code: Correct tsv outfiles with formatting errors
#!/bin/bash
run=FALSE
[[ "${run}" == TRUE ]] &&
{
cd "${HOME}/tsukiyamalab/kalavatt/2022-2023_RRP6-NAB3/results/2023-0215/outfiles_htseq-count/Trinity-GG"
cd G_N/
mv \
trinity-gg_mkc-16_mir-0.05_mg-2_gf-0.05.hc-strd-eq-mRNA.tsv \
problem.trinity-gg_mkc-16_mir-0.05_mg-2_gf-0.05.hc-strd-eq-mRNA.tsv
cd ../Q_N/
mv \
trinity-gg_mkc-2_mir-0.05_mg-2_gf-0.05.hc-strd-eq-mRNA.tsv \
problem.trinity-gg_mkc-2_mir-0.05_mg-2_gf-0.05.hc-strd-eq-mRNA.tsv
mv \
trinity-gg_mkc-16_mir-0.05_mg-2_gf-0.05.hc-strd-eq-CDS.tsv \
problem.trinity-gg_mkc-16_mir-0.05_mg-2_gf-0.05.hc-strd-eq-CDS.tsv
mv \
trinity-gg_mkc-16_mir-0.05_mg-2_gf-0.05.hc-strd-eq-exon.tsv \
problem.trinity-gg_mkc-16_mir-0.05_mg-2_gf-0.05.hc-strd-eq-exon.tsv
mv \
trinity-gg_mkc-32_mir-0.05_mg-2_gf-0.05.hc-strd-eq-exon.tsv \
problem.trinity-gg_mkc-32_mir-0.05_mg-2_gf-0.05.hc-strd-eq-exon.tsv
# mRNA ------------------------------------------
run=TRUE
[[ "${run}" == TRUE ]] &&
{
# for i in "strd-eq" "strd-rv"; do
for i in "strd-eq"; do
for k in \
"infiles_gtf-gff3/Trinity-GG/G_N/trinity-gg_mkc-16_mir-0.05_mg-2_gf-0.05" \
"infiles_gtf-gff3/Trinity-GG/Q_N/trinity-gg_mkc-2_mir-0.05_mg-2_gf-0.05"
do
# i="strd-eq" # echo "${i}"
# k="${stems[7]}" # echo "${k}"
in="$(echo "${k}" | sed 's/infiles/outfiles/g' - ).gffread.gff3" # ., "${in}" # less "${in}" # Approach #2
out="$(echo "${k}" | sed 's/infiles_gtf-gff3/outfiles_htseq-count/g' - ).hc-${i}.tsv" # echo "${out}"
err_out="$(dirname "${out}")/err_out/03-htseq-count-${i}.$(basename "${out}" .tsv)" # echo "${err_out}"
if [[ "${i}" == "strd-eq" ]]; then
hc_strd="yes"
elif [[ "${i}" == "strd-rv" ]]; then
hc_strd="reverse"
fi
echo """
# -------------------------
# Call to htseq-count: mRNA
# -------------------------
sbatch \\
--job-name=\"htseq-count-mRNA\" \\
--nodes=1 \\
--cpus-per-task=8 \\
--error=\"${err_out}-mRNA.%A.stderr.txt\" \\
--output=\"${err_out}-mRNA.%A.stdout.txt\" \\
htseq-count \\
--order \"pos\" \\
--stranded \"${hc_strd}\" \\
--nonunique \"all\" \\
--type \"mRNA\" \\
--idattr \"ID\" \\
--nprocesses 8 \\
--counts_output \"${out/.tsv/-mRNA.tsv}\" \\
--with-header \\
\${bams[*]} \\
\"${in}\"
"""
sbatch \
--job-name="htseq-count-mRNA" \
--nodes=1 \
--cpus-per-task=8 \
--error="${err_out}-mRNA.%A.stderr.txt" \
--output="${err_out}-mRNA.%A.stdout.txt" \
htseq-count \
--order "pos" \
--stranded "${hc_strd}" \
--nonunique "all" \
--type "mRNA" \
--idattr "ID" \
--nprocesses 8 \
--counts_output "${out/.tsv/-mRNA.tsv}" \
--with-header \
${bams[*]} \
"${in}"
sleep 0.33
done
done
}
# CDS -------------------------------------------
run=TRUE
[[ "${run}" == TRUE ]] &&
{
# for i in "strd-eq" "strd-rv"; do
for i in "strd-eq"; do
for k in "infiles_gtf-gff3/Trinity-GG/Q_N/trinity-gg_mkc-16_mir-0.05_mg-2_gf-0.05"; do
# i="strd-eq" # echo "${i}"
# k="${stems[7]}" # echo "${k}"
in="$(echo "${k}" | sed 's/infiles/outfiles/g' - ).gffread.gff3" # ., "${in}" # less "${in}" # Approach #2
out="$(echo "${k}" | sed 's/infiles_gtf-gff3/outfiles_htseq-count/g' - ).hc-${i}.tsv" # echo "${out}"
err_out="$(dirname "${out}")/err_out/03-htseq-count-${i}.$(basename "${out}" .tsv)" # echo "${err_out}"
if [[ "${i}" == "strd-eq" ]]; then
hc_strd="yes"
elif [[ "${i}" == "strd-rv" ]]; then
hc_strd="reverse"
fi
echo """
# -------------------------
# Call to htseq-count: CDS
# -------------------------
sbatch \\
--job-name=\"htseq-count-CDS\" \\
--nodes=1 \\
--cpus-per-task=8 \\
--error=\"${err_out}-CDS.%A.stderr.txt\" \\
--output=\"${err_out}-CDS.%A.stdout.txt\" \\
htseq-count \\
--order \"pos\" \\
--stranded \"${hc_strd}\" \\
--nonunique \"all\" \\
--type \"CDS\" \\
--idattr \"Parent\" \\
--nprocesses 8 \\
--counts_output \"${out/.tsv/-CDS.tsv}\" \\
--with-header \\
\${bams[*]} \\
\"${in}\"
"""
sbatch \
--job-name="htseq-count-CDS" \
--nodes=1 \
--cpus-per-task=8 \
--error="${err_out}-CDS.%A.stderr.txt" \
--output="${err_out}-CDS.%A.stdout.txt" \
htseq-count \
--order "pos" \
--stranded "${hc_strd}" \
--nonunique "all" \
--type "CDS" \
--idattr "Parent" \
--nprocesses 8 \
--counts_output "${out/.tsv/-CDS.tsv}" \
--with-header \
${bams[*]} \
"${in}"
sleep 0.33
done
done
}
# exon ------------------------------------------
run=TRUE
[[ "${run}" == TRUE ]] &&
{
# for i in "strd-eq" "strd-rv"; do
for i in "strd-eq"; do
for k in \
"infiles_gtf-gff3/Trinity-GG/Q_N/trinity-gg_mkc-16_mir-0.05_mg-2_gf-0.05" \
"infiles_gtf-gff3/Trinity-GG/Q_N/trinity-gg_mkc-32_mir-0.05_mg-2_gf-0.05"
do
# i="strd-eq" # echo "${i}"
# k="${stems[7]}" # echo "${k}"
in="$(echo "${k}" | sed 's/infiles/outfiles/g' - ).gffread.gff3" # ., "${in}" # less "${in}" # Approach #2
out="$(echo "${k}" | sed 's/infiles_gtf-gff3/outfiles_htseq-count/g' - ).hc-${i}.tsv" # echo "${out}"
err_out="$(dirname "${out}")/err_out/03-htseq-count-${i}.$(basename "${out}" .tsv)" # echo "${err_out}"
if [[ "${i}" == "strd-eq" ]]; then
hc_strd="yes"
elif [[ "${i}" == "strd-rv" ]]; then
hc_strd="reverse"
fi
echo """
# -------------------------
# Call to htseq-count: exon
# -------------------------
sbatch \\
--job-name=\"htseq-count-exon\" \\
--nodes=1 \\
--cpus-per-task=8 \\
--error=\"${err_out}-exon.%A.stderr.txt\" \\
--output=\"${err_out}-exon.%A.stdout.txt\" \\
htseq-count \\
--order \"pos\" \\
--stranded \"${hc_strd}\" \\
--nonunique \"all\" \\
--type \"exon\" \\
--idattr \"Parent\" \\
--nprocesses 8 \\
--counts_output \"${out/.tsv/-exon.tsv}\" \\
--with-header \\
\${bams[*]} \\
\"${in}\"
"""
sbatch \
--job-name="htseq-count-exon" \
--nodes=1 \
--cpus-per-task=8 \
--error="${err_out}-exon.%A.stderr.txt" \
--output="${err_out}-exon.%A.stdout.txt" \
htseq-count \
--order "pos" \
--stranded "${hc_strd}" \
--nonunique "all" \
--type "exon" \
--idattr "Parent" \
--nprocesses 8 \
--counts_output "${out/.tsv/-exon.tsv}" \
--with-header \
${bams[*]} \
"${in}"
sleep 0.33
done
done
}
}
Proceed to work_assessment-processing_gtfs_part-1.Rmd