-
Notifications
You must be signed in to change notification settings - Fork 10
Expand file tree
/
Copy pathTIGAR_GReX_Pred.sh
More file actions
executable file
·154 lines (126 loc) · 5.39 KB
/
TIGAR_GReX_Pred.sh
File metadata and controls
executable file
·154 lines (126 loc) · 5.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/bash
#######################################################################
### Input Arguments for GReX Prediction
#######################################################################
# --chr: Chromosome number need to be specified with respect to the genotype input data
# --weight: Path for SNP weight (eQTL effect size) file
# --gene_anno : Path for gene annotation file to specify a list of gene for GReX prediction
# `--gene_anno`: Gene annotation file to specify the list of genes, which is of the same format as the first five columsn of gene expression file
# --test_sampleID: Path for a file with sampleIDs that should be contained in the genotype file
# --genofile: Path for the training genotype file (bgzipped and tabixed)
# --genofile_tye: Genotype file type: "vcf" or "dosage"
# --format: Genotype format in VCF file that should be used: "GT" (default) for genotype data or "DS" for dosage data, only required if the input genotype file is of VCF file
# --window: Window size around gene region for selecting cis-SNPs for fitting gene expression prediction model (default 1000000 for +- 1MB region around gene)
# maf_diff: MAF difference threshold for matching SNPs from eQTL weight file and test genotype file. If SNP MAF difference is greater than maf_diff (default 0.2), , the SNP will be excluded
# --TIGAR_dir : Specify the directory of TIGAR source code
# --thread: Number of threads for parallel computation (default 1)
# --out_dir: Output directory (will be created if not exist)
#######################################################################
VARS=`getopt -o "" -a -l \
chr:,weight:,gene_anno:,genofile_type:,genofile:,test_sampleID:,format:,window:,missing_rate:,maf_diff:,TIGAR_dir:,thread:,sub_dir:,out_pred_file:,out_prefix:,log_file:,out_dir: \
-- "$@"`
if [ $? != 0 ]
then
echo "Terminating....." >&2
exit 1
fi
eval set -- "$VARS"
while true
do
case "$1" in
--chr|-chr) chr=$2; shift 2;;
--weight|-weight) weight_file=$2; shift 2;;
--gene_anno|-gene_anno) gene_anno=$2; shift 2;;
--genofile_type|-genofile_type) genofile_type=$2; shift 2;;
--genofile|-genofile) genofile=$2; shift 2;;
--test_sampleID|-test_sampleID) test_sampleID_file=$2; shift 2;;
--format|-format) format=$2; shift 2;;
--window|-window) window=$2; shift 2;;
--missing_rate|-missing_rate) missing_rate=$2; shift 2;;
--maf_diff|-maf_diff) maf_diff=$2; shift 2;;
--TIGAR_dir|-TIGAR_dir) TIGAR_dir=$2; shift 2;;
--thread|-thread) thread=$2; shift 2;;
--sub_dir|-sub_dir) sub_dir=$2; shift 2;;
--out_prefix|-out_prefix) out_prefix=$2; shift 2;;
--out_pred_file|-out_pred_file) out_pred_file=$2; shift 2;;
--log_file|-log_file) log_file=$2; shift 2;;
--out_dir|-out_dir) out_dir=$2; shift 2;;
--) shift;break;;
*) echo "Internal error!";exit 1;;
esac
done
#### default value
window=${window:-$((10**6))}
missing_rate=${missing_rate:-0.2}
maf_diff=${maf_diff:-0.2}
thread=${thread:-1}
out_prefix=${out_prefix:-CHR${chr}_Pred}
out_pred_file=${out_pred_file:-${out_prefix}_GReX.txt}
log_file=${log_file:-${out_prefix}_log.txt}
# sub_dir: whether to use subdirectory inside out_dir for output files
sub_dir=${sub_dir:-1}
#### Create output directory if not existed
mkdir -p ${out_dir}
mkdir -p ${out_dir}/logs
# sub directory in out directory
if [[ "$sub_dir"x == "1"x ]];then
out_sub_dir=${out_dir}/Pred_CHR${chr}
else
out_sub_dir=${out_dir}
fi
mkdir -p ${out_sub_dir}
####################################################
# check tabix command
if [ ! -x "$(command -v tabix)" ]; then
echo 'Error: required tool TABIX is not available.' >&2
exit 1
fi
# Check genotype file
if [ ! -f "${genofile}" ]; then
echo Error: Training genotype file ${genofile} does not exist or is empty. >&2
exit 1
# check chromosome column to see if it starts with "chr"; will cause "No tabix data" error for all genes
startswithchr=`zgrep -E '^#' -v ${genofile} | head -n10 | grep -E '^chr' -c`
# warn but don't stop job?
if ((startswithchr != 0)); then
echo Error: A check of the first 10 non-header lines in your genotype file shows that CHROM column values seem to start with \"chr\"! >&2
exit 1
fi
fi
# Check training sample ID file
if [ ! -f "${test_sampleID_file}" ] ; then
echo Error: Test sample ID file ${test_sampleID_file} does not exist or is empty. >&2
exit 1
fi
# Check eQTL weight file
if [ ! -f "${weight_file}" ] ; then
echo Error: eQTL weight file ${weight_file} does not exist or is empty. >&2
exit 1
fi
# Check gene annotation file
if [ ! -f "${gene_anno}" ] ; then
echo Error: gene info file ${gene_anno} does not exist or is empty. >&2
exit 1
fi
# Make python script executible
if [[ ! -x ${TIGAR_dir}/Model_Train_Pred/Prediction.py ]] ; then
chmod 755 ${TIGAR_dir}/Model_Train_Pred/Prediction.py
fi
echo Predicting gene expression.
python ${TIGAR_dir}/Model_Train_Pred/Prediction.py \
--chr ${chr} \
--weight ${weight_file} \
--genofile ${genofile} \
--test_sampleID ${test_sampleID_file} \
--format ${format} \
--genofile_type ${genofile_type} \
--gene_anno ${gene_anno} \
--window ${window} \
--thread ${thread} \
--maf_diff ${maf_diff} \
--missing_rate ${missing_rate} \
--out_pred_file ${out_pred_file} \
--TIGAR_dir ${TIGAR_dir} \
--out_dir ${out_sub_dir} \
> ${out_dir}/logs/${log_file}
echo Completed prediction.