Skip to content

Latest commit

 

History

History
360 lines (308 loc) · 11.2 KB

work_representative-non-coding-transcriptome_part-5.md

File metadata and controls

360 lines (308 loc) · 11.2 KB

work_representative-non-coding-transcriptome_part-5.md

Table of Contents
  1. Get situated
    1. Code
  2. Run htseq-count on bams in bams_renamed/ with .gtfs in outfiles_gtf-gff3/representation
    1. Set up outfile directories
      1. Code
    2. Set up arrays of bams
      1. Code
    3. Index bams
      1. Code
    4. Run htseq-count with .gtfs in outfiles_gtf-gff3/representation
      1. Set up necessary arrays, variables
        1. Code
      2. Set up and submit htseq-count jobs
        1. Code

Get situated

Code

Code: Get situated
#!/bin/bash

# tmux new -s htseq
# tmux a -t htseq

transcriptome && 
    {
        cd "results/2023-0215/" \
            || echo "cd'ing failed; check on this..."
    }

source activate gff3_env


Run htseq-count on bams in bams_renamed/ with .gtfs in outfiles_gtf-gff3/representation

Set up outfile directories

Code

Code: Set up outfile directories
#!/bin/bash

for h in ./outfiles_htseq-count/representation/UT_prim_UMI/*; do
    if [[ ! -e "${h}" ]]; then
        mkdir -p outfiles_htseq-count/representation/UT_prim_UMI/err_out
    else
        echo "Directories present; skipping mkdir'ing of outfile directories"
    fi

    break
done

Set up arrays of bams

Code

Code: Set up arrays of bams
#!/bin/bash

unset UT_prim_UMI
typeset -a UT_prim_UMI
while IFS=" " read -r -d $'\0'; do
    UT_prim_UMI+=( "${REPLY}" )
done < <(\
    find "bams_renamed/UT_prim_UMI" \
        -type l \
        -name "*.bam" \
        -print0 \
            | sort -z \
)
echo_test "${UT_prim_UMI[@]}"
echo "${#UT_prim_UMI[@]}"
echo "${UT_prim_UMI[*]}"

Index bams

Code

Code: Index bams
#!/bin/bash

for h in ./bams_renamed/UT_prim_UMI/*.bai; do
    if [[ ! -e "${h}" ]]; then
        ml SAMtools/1.16.1-GCC-11.2.0

        for i in "${UT_prim_UMI[@]}"; do
                echo "${i}"
                samtools index -@ "${SLURM_CPUS_ON_NODE}" "${i}"

            module purge SAMtools/1.16.1-GCC-11.2.0
        done
    else
        echo "Bam indices exist; skipping the running of samtools index"
    fi

    break
done

Run htseq-count with .gtfs in outfiles_gtf-gff3/representation

Set up necessary arrays, variables

Code
Code: Set up necessary variables
#!/bin/bash

p_gtf=outfiles_gtf-gff3/representation  # ls -1 "${p_gtf}"

full=FALSE  # echo "${full}"
if [[ "${full}" == TRUE ]]; then
    gtf=(
        "${p_gtf}/Greenlaw-et-al_CUTs-4x.gtf"
        "${p_gtf}/Greenlaw-et-al_CUTs.gtf"
        "${p_gtf}/Greenlaw-et-al_CUTs-HMM.gtf"
        "${p_gtf}/Greenlaw-et-al_ncRNAs.gtf"
        "${p_gtf}/Greenlaw-et-al_non-collapsed-non-coding-transcriptome.gtf"
        "${p_gtf}/Greenlaw-et-al_NUTs.gtf"
        "${p_gtf}/Greenlaw-et-al_representative-non-coding-transcriptome.gtf"
        "${p_gtf}/Greenlaw-et-al_SRATs.gtf"
        "${p_gtf}/Greenlaw-et-al_SUTs.gtf"
        "${p_gtf}/Greenlaw-et-al_XUTs.gtf"
    )
    idattr="gene_id"  # echo "${idattr}"
else
    gtf=( "${p_gtf}/Greenlaw-et-al_non-collapsed-non-coding-transcriptome.gtf" )
    idattr="complete"  # echo "${idattr}"
fi

echo_test "${gtf[@]}"
echo "${#gtf[@]}"

job_name="run_htseq-count"  # echo "${job_name}"
threads=12  # echo "${threads}"

# echo_test "${UT_prim_UMI[@]}"
# echo "${#UT_prim_UMI[@]}"

Set up and submit htseq-count jobs

Code
Code: Set up and submit htseq-count jobs
#!/bin/bash

#  Run echo tests for calls to htseq-count ------------------------------------
run=TRUE
[[ ${run} == TRUE ]] &&
    {
        h=0
        # for i in "strd-eq" "strd-rv"; do
        for i in "strd-eq"; do
            for j in "${gtf[@]}"; do
                # i="strd-eq"  # echo "${i}"
                # j="${gtf[3]}"  # echo "${j}"

                #  -------------------------------------
                count_against="${j}"  # echo "${count_against}"
                out="outfiles_htseq-count/representation/UT_prim_UMI/$(
                    echo $(basename "${count_against}") \
                        | sed 's/Greenlaw-et-al_//g;s/.gtf//g'
                ).hc-${i}.tsv"   # echo "${out}"  # ., "$(dirname "${out}")"

                err_out="$(
                    dirname "${out}"
                )/err_out/$(
                    basename "${out}" .tsv
                )"  # echo "${err_out}"  # ., "$(dirname "${err_out}")"


                #  -------------------------------------
                let h++
                iter="${h}"
                echo "        #  -------------------------------------"
                printf "        Iteration '%d'\n" "${iter}"

                echo """
                Running htseq-count
                            directory                                                file
                       out  $(dirname ${out})          $(basename ${out})
                    stdout  $(dirname ${err_out})  $(basename ${err_out}).stdout.txt
                    stderr  $(dirname ${err_out})  $(basename ${err_out}).stderr.txt
                """

                if [[ "${i}" == "strd-eq" ]]; then
                    hc_strd="yes"  # echo "${hc_strd}"
                elif [[ "${i}" == "strd-op" ]]; then
                    hc_strd="reverse"  # echo "${hc_strd}"
                fi


                #  -------------------------------------
                echo """
                sbatch \\
                    --job-name=${job_name} \\
                    --nodes=1 \\
                    --cpus-per-task=${threads} \\
                    --error=${err_out}.%A.stderr.txt \\
                    --output=${err_out}.%A.stdout.txt \\
                    htseq-count \\
                        --order \"pos\" \\
                        --stranded \"${hc_strd}\" \\
                        --nonunique \"none\" \\
                        --type \"feature\" \\
                        --idattr \"${idattr}\" \\
                        --nprocesses ${threads} \\
                        --counts_output \"${out}\" \\
                        --with-header \\
                        ${UT_prim_UMI[*]} \\
                        \"${count_against}\" \\
                             > >(tee -a \"${err_out}.stdout.txt\") \\
                            2> >(tee -a \"${err_out}.stderr.txt\")
                """
            done
        done
    }


#  Run actual calls to htseq-count --------------------------------------------
run=FALSE
[[ ${run} == TRUE ]] &&
    {
        h=0
        # for i in "strd-eq" "strd-rv"; do
        for i in "strd-eq"; do
            for j in "${gtf[@]}"; do
                # i="strd-eq"  # echo "${i}"
                # j="${gtf[3]}"  # echo "${j}"

                #  -------------------------------------
                count_against="${j}"  # echo "${count_against}"
                out="outfiles_htseq-count/representation/UT_prim_UMI/$(
                    echo $(basename "${count_against}") \
                        | sed 's/Greenlaw-et-al_//g;s/.gtf//g'
                ).hc-${i}.tsv"   # echo "${out}"  # ., "$(dirname "${out}")"

                err_out="$(
                    dirname "${out}"
                )/err_out/$(
                    basename "${out}" .tsv
                )"  # echo "${err_out}"  # ., "$(dirname "${err_out}")"


                #  -------------------------------------
                let h++
                iter="${h}"
                echo "        #  -------------------------------------"
                printf "        Iteration '%d'\n" "${iter}"

                echo """
                Running htseq-count
                            directory                                                file
                       out  $(dirname ${out})          $(basename ${out})
                    stdout  $(dirname ${err_out})  $(basename ${err_out}).stdout.txt
                    stderr  $(dirname ${err_out})  $(basename ${err_out}).stderr.txt
                """

                if [[ "${i}" == "strd-eq" ]]; then
                    hc_strd="yes"  # echo "${hc_strd}"
                elif [[ "${i}" == "strd-op" ]]; then
                    hc_strd="reverse"  # echo "${hc_strd}"
                fi


                #  -------------------------------------
                echo """
                sbatch \\
                    --job-name=${job_name} \\
                    --nodes=1 \\
                    --cpus-per-task=${threads} \\
                    --error=${err_out}.%A.stderr.txt \\
                    --output=${err_out}.%A.stdout.txt \\
                    htseq-count \\
                        --order \"pos\" \\
                        --stranded \"${hc_strd}\" \\
                        --nonunique \"none\" \\
                        --type \"feature\" \\
                        --idattr \"${idattr}\" \\
                        --nprocesses ${threads} \\
                        --counts_output \"${out}\" \\
                        --with-header \\
                        ${UT_prim_UMI[*]} \\
                        \"${count_against}\" \\
                             > >(tee -a \"${err_out}.stdout.txt\") \\
                            2> >(tee -a \"${err_out}.stderr.txt\")
                """

                sbatch \
                    --job-name=${job_name} \
                    --nodes=1 \
                    --cpus-per-task=${threads} \
                    --error=${err_out}.%A.stderr.txt \
                    --output=${err_out}.%A.stdout.txt \
                    htseq-count \
                        --order "pos" \
                        --stranded "${hc_strd}" \
                        --nonunique "none" \
                        --type "feature" \
                        --idattr "${idattr}" \
                        --nprocesses ${threads} \
                        --counts_output "${out}" \
                        --with-header \
                        ${UT_prim_UMI[*]} \
                        "${count_against}"

                sleep 0.5
                echo ""
            done
        done
    }