-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathsra_data_downloader.sh
158 lines (106 loc) · 4.49 KB
/
sra_data_downloader.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/bin/bash
#WARNING: The author is not responsible for anything that happens as a result of using this script
help_=$1
if [ "$help_" == "help" ] || [ "$help_" == "--help" ]
then
echo "welcome to the sra data downloader wrapper script"
echo "script usage:"
echo " ./sra_data_downloader.sh <id_file> b <no_of_files_per_batch> < <fasterq-dump options>"
echo " or "
echo " ./sra_data_downloader.sh <id_file> p <no_of_jobs> <fasterq-dump options>"
echo " "
echo " <id_file>: is a textfile containing accession numbers of the data to be downloaded."
echo " b: is an indication that the script should download the data in batches."
echo " <no_of_files_per_batch>: indicates the number of files to download per batch."
echo " this should be an integer value."
echo " eg. if no_of_files_per_batch is 5, every batch to be downloaded will"
echo " will comprise of 5 files."
echo " p: indicates that GNU parallel should be used for downloading the data."
echo " the files to be downloaded will be distributed among the cpus."
echo " <no_of_jobs>: this indicates the number of jobs to be distributed."
echo " eg. if no_of_jobs of 5 means that the task will be split in 5"
echo " and distributed among the cpus on the system"
echo " this should be an integer value."
echo " <fasterq-dump options>: contains fasterq-dump options to be used."
echo " to learn more about the options, please refer to the sratoolkit wiki."
echo " fasterq-dump options should be separated with a space character."
echo " "
echo "example usage:"
echo " ./sra_data_downloader.sh textfile.txt"
echo " ./sra_data_downloader.sh textfile.txt b 4 --skip-technical --split-files"
echo " ./sra_data_downloader.sh textfile.txt p 4 --skip-technical"
exit
else
set -e
#get current working directory
cwd=$(pwd)
#get all bash arguments
allargs=("$@")
#get the text file that contains the accesssion numbers
textfile=$1
#get the accession numbers in the text file
ids=($(cat "$textfile"))
if [ "$2" == "b" ]
then
nfiles=$3
#if [ ! -n "$3" ]
if ! echo "$3" | grep -qE '^[0-9]+$';
then
echo "please specify the number of files per batch."
echo "the value should be an integer."
echo "$nfiles files was specified. this is a wrong entry"
exit
else
number_of_files_per_batch="$3"
fasterqdump_options=(${allargs[@]:3})
fi
elif [ "$2" == "p" ]
then
batch_downloader="$cwd"/batch_downloader.txt
njobs="$3"
if ! echo "$3" | grep -qE '^[0-9]+$';
then
echo "please specify the number of jobs."
echo "the value should be an integer."
echo "\"$njobs\" jobs was specified. this is a wrong entry"
exit
else
fasterqdump_options=(${allargs[@]:3})
echo "number of jobs: $njobs "
foptions=$(IFS=" " ; echo "${fasterqdump_options[*]}")
echo "id_file: $textfile"
echo "fasterq-dump options: $foptions"
if [ -f "$batch_downloader" ]
then
rm "$batch_downloader"
fi
for id in ${ids[@]}
do
echo $"fasterq-dump $id $foptions">>"$batch_downloader"
done
parallel --jobs "$njobs" < "$batch_downloader"
rm "$batch_downloader"
exit
fi
else
declare -i number_of_files_per_batch=1
fasterqdump_options=(${allargs[@]:1})
#fasterqdump_options=$(echo "${@:2}")
fi
echo "number of files per batch: $number_of_files_per_batch"
foptions=$(IFS=" " ; echo "${fasterqdump_options[*]}")
echo "id_file: $textfile"
echo "fasterq-dump options: $foptions"
b="$number_of_files_per_batch"
for((i=0; i < ${#ids[@]}; i+=b))
do
part=( "${ids[@]:i:b}" )
allids=$(IFS=" " ; echo "${part[*]}")
batch_ids="$allids "
echo "Downloading $batch_ids"
cmd="fasterq-dump $batch_ids $foptions"
echo $cmd
eval $cmd
#fasterq-dump ${batch_ids} ${foptions}
done
fi