-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathmancdb
More file actions
executable file
·155 lines (134 loc) · 4.42 KB
/
mancdb
File metadata and controls
executable file
·155 lines (134 loc) · 4.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/env bash
# make sure these tools are available in system path
# hmmpress http://hmmer.org/
MANCDB_VERSION=1.0
## subroutine for help message
mancdb_usage() {
echo ""
echo " Program: mancdb (curate OatkDB gene HMM profile database)"
echo " Version: ${MANCDB_VERSION}"
echo ""
echo " Usage: mancdb [options] gene_list tmpdir output"
echo " Optional:"
echo " -f|--force Overwrite existing files."
echo " -s|--sort Sort gene by names."
echo " --log STR Log file (default stdout)."
echo " -h|--help Print this help message."
echo " -v|--version Print version number."
echo ""
echo " Example: mancdb gene_list tmpdir angiosperms_pltd_v$(date +%Y%m%d)"
echo ""
}
## subroutine for tool availability checking
check_tool() {
tool=$1; shift
site=$1; shift
if ! command -v ${tool} &> /dev/null
then
>&2 echo "Command '${tool}' not found"
>&2 echo "It can be installed from ${site}"
>&2 echo "If it has been installed, add it to system path"
exit 1
fi
}
## subroutine for logging
info_message() {
message=$1; shift
dest=$1; shift
if [[ -z ${dest} ]]; then dest=${LOGFILE}; fi
echo "[$(date +'%Y-%m-%d::%H:%M:%S')] ${message}" >>${dest}
}
## check availabilities of dependencies
check_tool hmmpress http://hmmer.org/
## parameter declaration
GENLIST=""
TEMPDIR=""
OUTPUT=""
FORCE=0
SORTN=0
LOGFILE="/dev/stdout"
HELP=0
VERSION=0
while [[ ${#} -gt 0 ]]
do
key=${1}
case ${key} in
-f|--force ) FORCE=1; shift;;
-s|--sort ) SORTN=1; shift;;
--log ) LOGFILE=${2}; shift; shift;;
-h|--help ) HELP=1; shift;;
-v|--version ) VERSION=1; shift;;
-*|--*= )
>&2 echo "ERROR: unknown options ${1}"
>&2 mancdb_usage
exit 1;;
* )
GENLIST=${1}; shift;
TEMPDIR=${1}; shift;
OUTPUT=${1}; shift;
esac
done
if [[ ${HELP} -eq 1 ]]; then >&1 mancdb_usage; exit 0; fi
if [[ ${VERSION} -eq 1 ]]; then >&1 echo ${MANCDB_VERSION}; exit 0; fi
if [[ -z ${GENLIST} ]] || [[ -z ${TEMPDIR} ]] || [[ -z ${OUTPUT} ]]
then
>&2 echo "ERROR: requires at least THREE positional parameters"
>&2 mancdb_usage
exit 1
fi
if [[ -f ${OUTPUT}.fam ]] && [[ ${FORCE} -eq 0 ]]
then
>&2 echo "ERROR: database file '${OUTPUT}.fam' exists"
>&2 echo " provide -f|--force option to overwrite"
exit 1
fi
rm -f ${OUTPUT}.fam ${OUTPUT}.fam.h3f ${OUTPUT}.fam.h3i ${OUTPUT}.fam.h3m ${OUTPUT}.fam.h3p || \
(>&2 echo "ERROR: existing profile database files are not overwritable"; exit 1)
if [[ ! -f ${GENLIST} ]]
then
>&2 echo "ERROR: gene list file '${GENLIST}' not found"
exit 1
fi
if [[ ! -d ${TEMPDIR} ]]
then
>&2 echo "ERROR: temp dir '${TEMPDIR}' not found"
exit 1
fi
outdir=$(dirname ${OUTPUT})
mkdir -p ${outdir} 2>/dev/null || \
(>&2 echo "ERROR: create output dir '${outdir}' failed"; exit 1)
hmmDBName="${TEMPDIR}/DB.curated.fam"
hmmDBSummary="${TEMPDIR}/DB.curated.summary"
geneSeqDir="${TEMPDIR}/GeneSeqs"
if [[ ! -d ${geneSeqDir} ]]
then
>&2 echo "ERROR: gene seq dir '${geneSeqDir}' not found"
exit 1
fi
rm -f ${hmmDBName} ${hmmDBSummary}
nSEQ=$(cat ${GENLIST} | grep -v "^#" | wc -l)
nGEN=0
sortByName="cat"
if [[ ${SORTN} -eq 1 ]]; then sortByName="sort -k1,1V -k3,3V"; fi
info_message "Starting HMM profile database curation."
info_message " Collecting individual HMM profile for each gene."
echo -e "#Type\tGene\tName\tNSEQ" >${hmmDBSummary}
while read -r gType gId gName gCount
do
hmmfile="${geneSeqDir}/${gType}/${gId}.clean.2.hmm"
if [[ -z ${gName} ]]; then gName=gId; fi
if [[ -f ${hmmfile} ]]
then
cat ${hmmfile} | sed -E "s/(^NAME\s+)([^\s]+$)/\1${gName}/" >> ${hmmDBName}
echo -e "${gType}\t${gId}\t${gName}\t$(cat ${hmmfile} | grep ^NSEQ | awk '{print $2}')" >>${hmmDBSummary}
nGEN=$((nGEN+1))
fi
done < <(cat ${GENLIST} | grep -v "^#" | head -${nSEQ} | ${sortByName})
info_message " [${nGEN}] genes collected."
info_message " Collecting individual HMM profiles DONE."
info_message " Making HMM database."
if [[ ! -f ${OUTPUT}.fam ]]; then cp ${hmmDBName} ${OUTPUT}.fam; fi
hmmpress ${OUTPUT}.fam 1>/dev/null
info_message " Making HMM database DONE."
info_message "HMM profile database curation DONE."
info_message "All DONE."