#work_combine-gtfs_processed-pa-ncRNA_part-1.md
Table of Contents
Code: Get situated
#!/bin/bash
# tmux new -s htseq
# tmux a -t htseq
transcriptome &&
{
cd "results/2023-0215/" \
|| echo "cd'ing failed; check on this..."
}
source activate gff3_env
...using bams in bams_renamed/
with the gtf
in outfiles_gtf-gff3/representation
Code: Set up outfile directories
#!/bin/bash
for h in ./outfiles_htseq-count/representation/UT_prim_UMI/*; do
if [[ ! -e "${h}" ]]; then
mkdir -p outfiles_htseq-count/representation/UT_prim_UMI/err_out
else
echo "Directories present; skipping mkdir'ing of outfile directories"
fi
break
done
Code: Set up arrays of bams
#!/bin/bash
unset UT_prim_UMI
typeset -a UT_prim_UMI
while IFS=" " read -r -d $'\0'; do
UT_prim_UMI+=( "${REPLY}" )
done < <(\
find "bams_renamed/UT_prim_UMI" \
-type l \
-name "*.bam" \
-print0 \
| sort -z \
)
echo_test "${UT_prim_UMI[@]}"
echo "${#UT_prim_UMI[@]}"
echo "${UT_prim_UMI[*]}"
Code: Index bams
#!/bin/bash
for h in ./bams_renamed/UT_prim_UMI/*.bai; do
if [[ ! -e "${h}" ]]; then
ml SAMtools/1.16.1-GCC-11.2.0
for i in "${UT_prim_UMI[@]}"; do
echo "${i}"
samtools index -@ "${SLURM_CPUS_ON_NODE}" "${i}"
module purge SAMtools/1.16.1-GCC-11.2.0
done
else
echo "Bam indices exist; skipping the running of samtools index"
fi
break
done
Code: Set up necessary variables
#!/bin/bash
p_gtf=outfiles_gtf-gff3/representation # ls -1 "${p_gtf}"
gtf="${p_gtf}/Greenlaw-et-al_representative-coding-pa-ncRNA-transcriptome.gtf" # echo "${gtf}" # ., "${gtf}"
unset nonunique
typeset -a nonunique=( "none" "all" "fraction" "random" )
# echo_test "${nonunique[@]}"
job_name="run_htseq-count" # echo "${job_name}"
threads=12 # echo "${threads}"
# echo_test "${UT_prim_UMI[@]}"
# echo "${#UT_prim_UMI[@]}"
Code: Set up and submit htseq-count jobs
#!/bin/bash
# Run echo tests for calls to htseq-count ------------------------------------
run=TRUE
[[ ${run} == TRUE ]] &&
{
h=0
# for i in "strd-eq" "strd-rv"; do
for i in "strd-eq"; do
# for j in "${gtf[@]}"; do
for j in "${gtf}"; do
for k in "${nonunique[@]}"; do
# i="strd-eq" # echo "${i}"
# j="${gtf}" # echo "${j}"
# k="${nonunique[1]}" # echo "${k}"
# -------------------------------------
count_against="${j}" # echo "${count_against}"
handle_gt_one="${k}" # echo "${handle_gt_one}"
out="outfiles_htseq-count/representation/UT_prim_UMI/$(
echo $(basename "${count_against}") \
| sed 's/Greenlaw-et-al_//g;s/.gtf//g'
).hc-${i}.union-${handle_gt_one}.tsv"
# echo "${out}" # ., "$(dirname "${out}")"
err_out="$(
dirname "${out}"
)/err_out/$(
basename "${out}" .tsv
)"
# echo "${err_out}" # ., "$(dirname "${err_out}")"
# -------------------------------------
let h++
iter="${h}"
echo " # -------------------------------------"
printf " Iteration '%d'\n" "${iter}"
echo """
Running htseq-count
directory file
out $(dirname ${out}) $(basename ${out})
stdout $(dirname ${err_out}) $(basename ${err_out}).stdout.txt
stderr $(dirname ${err_out}) $(basename ${err_out}).stderr.txt
"""
if [[ "${i}" == "strd-eq" ]]; then
hc_strd="yes" # echo "${hc_strd}"
elif [[ "${i}" == "strd-op" ]]; then
hc_strd="reverse" # echo "${hc_strd}"
fi
# -------------------------------------
echo """
sbatch \\
--job-name=${job_name} \\
--nodes=1 \\
--cpus-per-task=${threads} \\
--error=${err_out}.%A.stderr.txt \\
--output=${err_out}.%A.stdout.txt \\
htseq-count \\
--order \"pos\" \\
--stranded \"${hc_strd}\" \\
--nonunique \"${handle_gt_one}\" \\
--type \"feature\" \\
--idattr \"gene_id\" \\
--nprocesses ${threads} \\
--counts_output \"${out}\" \\
--with-header \\
${UT_prim_UMI[*]} \\
\"${count_against}\" \\
> >(tee -a \"${err_out}.stdout.txt\") \\
2> >(tee -a \"${err_out}.stderr.txt\")
"""
done
done
done
}
# Run actual calls to htseq-count --------------------------------------------
run=TRUE
[[ ${run} == TRUE ]] &&
{
h=0
# for i in "strd-eq" "strd-rv"; do
for i in "strd-eq"; do
# for j in "${gtf[@]}"; do
for j in "${gtf}"; do
for k in "${nonunique[@]}"; do
# i="strd-eq" # echo "${i}"
# j="${gtf}" # echo "${j}"
# k="${nonunique[1]}" # echo "${k}"
# -------------------------------------
count_against="${j}" # echo "${count_against}"
handle_gt_one="${k}" # echo "${handle_gt_one}"
out="outfiles_htseq-count/representation/UT_prim_UMI/$(
echo $(basename "${count_against}") \
| sed 's/Greenlaw-et-al_//g;s/.gtf//g'
).hc-${i}.union-${handle_gt_one}.tsv"
# echo "${out}" # ., "$(dirname "${out}")"
err_out="$(
dirname "${out}"
)/err_out/$(
basename "${out}" .tsv
)"
# echo "${err_out}" # ., "$(dirname "${err_out}")"
# -------------------------------------
let h++
iter="${h}"
echo " # -------------------------------------"
printf " Iteration '%d'\n" "${iter}"
echo """
Running htseq-count
directory file
out $(dirname ${out}) $(basename ${out})
stdout $(dirname ${err_out}) $(basename ${err_out}).stdout.txt
stderr $(dirname ${err_out}) $(basename ${err_out}).stderr.txt
"""
if [[ "${i}" == "strd-eq" ]]; then
hc_strd="yes" # echo "${hc_strd}"
elif [[ "${i}" == "strd-op" ]]; then
hc_strd="reverse" # echo "${hc_strd}"
fi
# -------------------------------------
echo """
sbatch \\
--job-name=${job_name} \\
--nodes=1 \\
--cpus-per-task=${threads} \\
--error=${err_out}.%A.stderr.txt \\
--output=${err_out}.%A.stdout.txt \\
htseq-count \\
--order \"pos\" \\
--stranded \"${hc_strd}\" \\
--nonunique \"${handle_gt_one}\" \\
--type \"feature\" \\
--idattr \"gene_id\" \\
--nprocesses ${threads} \\
--counts_output \"${out}\" \\
--with-header \\
${UT_prim_UMI[*]} \\
\"${count_against}\" \\
> >(tee -a \"${err_out}.stdout.txt\") \\
2> >(tee -a \"${err_out}.stderr.txt\")
"""
sbatch \
--job-name=${job_name} \
--nodes=1 \
--cpus-per-task=${threads} \
--error=${err_out}.%A.stderr.txt \
--output=${err_out}.%A.stdout.txt \
htseq-count \
--order "pos" \
--stranded "${hc_strd}" \
--nonunique "${handle_gt_one}" \
--type "feature" \
--idattr "gene_id" \
--nprocesses ${threads} \
--counts_output "${out}" \
--with-header \
${UT_prim_UMI[*]} \
"${count_against}"
sleep 0.5
echo ""
done
done
done
}