-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathread_clean.sh
More file actions
89 lines (70 loc) · 2.59 KB
/
read_clean.sh
File metadata and controls
89 lines (70 loc) · 2.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/bin/zsh
# Ensure the script stops on errors
set -e
# Display usage information
usage() {
echo "Usage: $0 -i <input_dir> -o <output_dir>"
echo "Note: Input directory must contain files in .fastq.gz format."
exit 1
}
# Parse command-line arguments
while getopts "i:o:" opt; do
case $opt in
i) input_dir="$OPTARG" ;;
o) output_dir="$OPTARG" ;;
*) usage ;;
esac
done
# Check if both input and output directories are provided
if [ -z "$input_dir" ] || [ -z "$output_dir" ]; then
usage
fi
# Ensure input directory exists
if [ ! -d "$input_dir" ]; then
echo "Error: Input directory '$input_dir' does not exist."
exit 1
fi
# Check if there are .fastq.gz files in the input directory
if ! ls "$input_dir"/*.fastq.gz >/dev/null 2>&1; then
echo "Error: No files with .fastq.gz format found in '$input_dir'."
echo "Please ensure the input directory contains files in the correct format."
exit 1
fi
# Use source to initialize conda with .zshrc
source ~/.zshrc
# Create and activate conda environment
CONDA_SUBDIR=osx-64 conda create -n rc_2 -y
conda activate rc_2
# Install required software in the conda environment
conda install -c bioconda -c conda-forge entrez-direct sra-tools fastp pigz tree -y
# Create output directory if it doesn't exist
mkdir -p "$output_dir"
# Loop through all FASTQ files in the input directory
for file in "$input_dir"/*_R1_001.fastq.gz; do
# Get the base name of the file
base=$(basename "$file" _R1_001.fastq.gz)
# Define input and output file names
read1="$input_dir/${base}_R1_001.fastq.gz"
read2="$input_dir/${base}_R2_001.fastq.gz"
output_read1="$output_dir/${base}_clean_R1_001.fastq.gz"
output_read2="$output_dir/${base}_clean_R2_001.fastq.gz"
out_uread1="$output_dir/${base}_clean_R1_001_unpaired.fastq.gz"
out_uread2="$output_dir/${base}_clean_R2_001_unpaired.fastq.gz"
# Define log file names
stdout_log="${output_dir}/${base}_fastp_std.out.txt"
stderr_log="${output_dir}/${base}_fastp_std.err.txt"
# Run fastp for quality control and log outputs
fastp -i "$read1" -I "$read2" -o "$output_read1" -O "$output_read2" \
--unpaired1 "$out_uread1" --unpaired2 "$out_uread2" \
--trim_front1 15 --trim_front2 15 \
-l 50 --average_qual 30 --trim_poly_x \
-h "$output_dir/${base}_fastp.html" \
-j "$output_dir/${base}_fastp.json" \
> "$stdout_log" 2> "$stderr_log"
# Combine unpaired reads into a single file
cat "$out_uread1" "$out_uread2" > "$output_dir/${base}_singletons.fastq.gz"
# Remove unpaired files after combining them
rm "$out_uread1" "$out_uread2"
done
# Deactivate conda environment
conda deactivate