Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions tools/repeatmasker/.lint_skip

This file was deleted.

2 changes: 1 addition & 1 deletion tools/repeatmasker/macros.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<macros>
<token name="@TOOL_VERSION@">4.1.5</token>
<token name="@TOOL_VERSION@">4.1.7</token>
<token name="@VERSION_SUFFIX@">galaxy0</token>
<xml name="edam_ontology">
<edam_topics>
Expand Down
89 changes: 59 additions & 30 deletions tools/repeatmasker/repeatmasker.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,31 +3,38 @@
<macros>
<import>macros.xml</import>
</macros>
<expand macro='xrefs'/>
<expand macro='edam_ontology' />
<expand macro='xrefs'/>
<expand macro='requirements' />
<version_command>repeatmasker --version</version_command>
<command detect_errors="exit_code"><![CDATA[
RM_PATH=\$(which RepeatMasker) &&
if [ -z "\$RM_PATH" ] ; then echo "Failed to find RepeatMasker in PATH (\$PATH)" >&2 ; exit 1 ; fi &&

if [ -z "\$RM_LIB_PATH" ] ; then RM_LIB_PATH=\$(dirname \$RM_PATH)/../share/RepeatMasker/Libraries ; fi &&
#if $repeat_source.source_type == "dfam_up":
mkdir lib/ &&
ln -s '${repeat_source.dfam_lib}' lib/RepeatMaskerLib.h5 &&
RM_LIB_PATH=\$(pwd)/lib &&

mkdir RM_LIB_PATH &&
for l in \$RM_LIB_PATH/*;
do
ln -s "\$l" RM_LIB_PATH/"\$(basename \$l)";
done &&

#if $repeat_source.source_type == "dfam":
ln -fs '${repeat_source.dfam_lib.fields.path}' RM_LIB_PATH/RepeatMaskerLib.h5 &&
#elif $repeat_source.source_type == "dfam_up":
ln -fs '${repeat_source.dfam_lib}' RM_LIB_PATH/RepeatMaskerLib.h5 &&
#end if

ln -s '${input_fasta}' rm_input.fasta &&

RepeatMasker -dir \$(pwd)
-libdir \$RM_LIB_PATH
-libdir RM_LIB_PATH
#if $repeat_source.source_type == "library":
-lib '${repeat_source.repeat_lib}'
-cutoff '${repeat_source.cutoff}'
#else if $repeat_source.source_type == "dfam":
#if $repeat_source.species_source.species_from_list == 'yes':
-species $repeat_source.species_source.species_list
-species '$repeat_source.species_source.species_list'
#else
-species '${repeat_source.species_source.species_name}'
#end if
Expand Down Expand Up @@ -92,34 +99,38 @@
<option value="library">Custom library of repeats</option>
</param>
<when value="dfam">
<param name="dfam_lib" type="select" label="Dfam">
<options from_data_table="dfam"/>
<validator type="no_options" message="No Dfam datasets installed - please contact your Galaxy admin."/>
</param>
<conditional name="species_source">
<param label="Select species name from a list?" name="species_from_list" type="select">
<option value="yes" selected="true">Yes</option>
<option value="no">No</option>
</param>
<when value="yes">
<param name="species_list" type="select" label="Species">
<option value="human" selected="true">Human (Homo sapiens)</option>
<option value="Homo sapiens" selected="true">Human (Homo sapiens)</option>
<option value="rodent">Rodent (Order Rodentia)</option>
<option value="mouse">Mouse (Mus musculus)</option>
<option value="rattus">Rat (Rattus sp.)</option>
<option value="danio">Danio (zebra fish)</option>
<option value="drosophila">Fruit fly (Drosophila melanogaster)</option>
<option value="elegans">Caenorhabditis elegans (nematode)</option>
<option value="Mus musculus">Mouse (Mus musculus)</option>
<option value="Rattus rattus">Rat (Rattus sp.)</option>
<option value="Danio rerio">Danio (zebra fish)</option>
<option value="Drosophila melanogaster">Fruit fly (Drosophila melanogaster)</option>
<option value="Caenorhabditis elegans">Caenorhabditis elegans (nematode)</option>
</param>
</when>
<when value="no">
<param name="species_name" type="text" value="human" label="Repeat source species" help="Source species (or clade name) used to select repeats from DFam" />
<param name="species_name" type="text" value="Homo sapiens" label="Repeat source species" help="Source species (or clade name) used to select repeats from DFam" />
</when>
</conditional>
</when>
<when value="dfam_up">
<param name="dfam_lib" type="data" format="h5" label="DFam library" help="The full DFam library can be downloaded from https://www.dfam.org/releases/current/families/Dfam.h5.gz" />
<param name="species_name" type="text" value="human" label="Repeat source species" help="Source species (or clade name) used to select repeats from DFam" />
<param name="species_name" type="text" value="Homo sapiens" label="Repeat source species" help="Source species (or clade name) used to select repeats from DFam" />
</when>
<when value="library">
<param name="repeat_lib" type="data" format="fasta" label="Custom library of repeats" />
<param name="cutoff" type="integer" argument="-cutoff" value="225" label="Cutoff score for masking repeats" />
<param type="integer" argument="-cutoff" value="225" label="Cutoff score for masking repeats" />
</when>
</conditional>
<param type="boolean" argument="-gff" truevalue="-gff" falsevalue="" label="Output annotation of repeats in GFF format" checked="false" />
Expand Down Expand Up @@ -178,20 +189,26 @@
<tests>
<test expect_num_outputs="4">
<param name="input_fasta" value="small.fasta" ftype="fasta" />
<param name="source_type" value="library" />
<param name="repeat_lib" value="repeats.fasta" ftype="fasta" />
<conditional name="repeat_source">
<param name="source_type" value="library" />
<param name="repeat_lib" value="repeats.fasta" ftype="fasta" />
</conditional>
<output name="output_masked_genome" file="small.fasta.masked" />
<output name="output_table" file="small.fasta.stats" lines_diff="6" />
<output name="output_repeat_catalog" file="small.fasta.cat" lines_diff="2" />
<output name="output_log" file="small.fasta.log" lines_diff="2"/>
</test>
<test expect_num_outputs="7">
<param name="input_fasta" value="small.fasta" ftype="fasta" />
<param name="source_type" value="library" />
<param name="gff" value="-gff" />
<param name="keep_alignments" value="-ali" />
<param name="poly" value="-poly" />
<param name="repeat_lib" value="repeats.fasta" ftype="fasta" />
<conditional name="repeat_source">
<param name="source_type" value="library" />
<param name="repeat_lib" value="repeats.fasta" ftype="fasta" />
</conditional>
<param name="gff" value="true" />
<section name="advanced">
<param name="keep_alignments" value="true" />
<param name="poly" value="true" />
</section>
<output name="output_masked_genome" file="small.fasta.masked" />
<output name="output_table" file="small.fasta.stats" lines_diff="6" />
<output name="output_repeat_catalog" file="small.fasta.cat" lines_diff="2" />
Expand All @@ -202,27 +219,39 @@
</test>
<test expect_num_outputs="4">
<param name="input_fasta" value="small.fasta" ftype="fasta" />
<param name="source_type" value="dfam" />
<param name="species_list" value="human" />
<conditional name="repeat_source">
<param name="source_type" value="dfam" />
<param name="dfam_lib" value="test" />
<conditional name="species_source">
<param name="species_list" value="Homo sapiens" />
</conditional>
</conditional>
<output name="output_masked_genome" file="small_dfam.fasta.masked" />
<output name="output_table" file="small_dfam.fasta.stats" lines_diff="2" />
<output name="output_repeat_catalog" file="small_dfam.fasta.cat" lines_diff="2" />
<output name="output_log" file="small_dfam.fasta.log" lines_diff="2"/>
</test>
<test expect_num_outputs="4">
<param name="input_fasta" value="small.fasta" ftype="fasta" />
<param name="source_type" value="dfam_up" />
<param name="dfam_lib" value="Dfam_partial_test.h5" ftype="h5" />
<param name="species_name" value="rodent" />
<conditional name="repeat_source">
<param name="source_type" value="dfam_up" />
<param name="dfam_lib" value="Dfam_partial_test.h5" ftype="h5" />
<param name="species_name" value="rodent" />
</conditional>
<output name="output_masked_genome" file="small_dfam_up.fasta.masked" />
<output name="output_table" file="small_dfam_up.fasta.stats" lines_diff="2" />
<output name="output_repeat_catalog" file="small_dfam_up.fasta.cat" lines_diff="2" />
<output name="output_log" file="small_dfam_up.fasta.log" lines_diff="2"/>
</test>
<test expect_num_outputs="4">
<param name="input_fasta" value="small.fasta" ftype="fasta" />
<param name="source_type" value="dfam" />
<param name="species_list" value="rattus" />
<conditional name="repeat_source">
<param name="source_type" value="dfam" />
<param name="dfam_lib" value="test" />
<conditional name="species_source">
<param name="species_list" value="rodent" />
</conditional>
</conditional>
<output name="output_masked_genome" file="small_dfam_rattus.fasta.masked" />
<output name="output_table" file="small_dfam_rattus.fasta.stats" lines_diff="2" />
<output name="output_repeat_catalog" file="small_dfam_rattus.fasta.cat" lines_diff="2" />
Expand Down
4 changes: 2 additions & 2 deletions tools/repeatmasker/test-data/small.fasta.cat
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,6 @@ Gap_init rate = 0.03 (1 / 35), avg. gap size = 1.00 (1 / 1)
## Total Length: 14220
## Total NonMask ( excluding >20bp runs of N/X bases ): 14220
## Total NonSub ( excluding all non ACGT bases ):14220
RepeatMasker version 4.1.5 , default mode
run with rmblastn version 2.13.0+
RepeatMasker version 4.1.7-p1 , default mode
run with rmblastn version 2.14.1+
RM Library:
8 changes: 4 additions & 4 deletions tools/repeatmasker/test-data/small.fasta.stats
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ Low complexity: 0 0 bp 0.00 %
Runs of >=20 X/Ns in query were excluded in % calcs


RepeatMasker version 4.1.5 , default mode
run with rmblastn version 2.13.0+
The query was compared to unclassified sequences in ".../dataset_9e3ddbd2-0776-4c6d-bed6-0f4cd415796c.dat"
RepeatMasker version 4.1.7-p1 , default mode

run with rmblastn version 2.14.1+
The query was compared to unclassified sequences in ".../dataset_37f353be-1b0e-4054-b3c1-b0b207477de3.dat"
FamDB:
22 changes: 3 additions & 19 deletions tools/repeatmasker/test-data/small_dfam.fasta.cat
Original file line number Diff line number Diff line change
Expand Up @@ -70,22 +70,6 @@ Matrix = Unknown
Transitions / transversions = 0.86 (6/7)
Gap_init rate = 0.03 (2 / 58), avg. gap size = 1.00 (2 / 2)

180 25.44 1.41 5.88 scaffold_1 8140 8210 (5941) C AmnL2-1#LINE/L2 (11) 2602 2535 m_b1s601i0

scaffold_1 8140 ACAACATTATTTTGTCTA-CACCCTGCATACAGCACAGTATATTAAATTT 8188
v v - ii i v i- v vii ---
C AmnL2-1#LINE/ 2602 ACAACTTTATTTTGTATAGCGTCTTTCATACAA-ACTGTATCCCAAA--- 2557

scaffold_1 8189 AGGTTTTATTAAGTTAAGTAAT 8210
v i ivi i
C AmnL2-1#LINE/ 2556 ACGCTTTACAGAGTTAAATAAT 2535

Matrix = 25p39g.matrix
Kimura (with divCpGMod) = 29.45
CpG sites = 10, Kimura (unadjusted) = 31.65
Transitions / transversions = 1.43 (10/7)
Gap_init rate = 0.07 (5 / 70), avg. gap size = 1.00 (5 / 5)

67 2.94 1.43 0.00 scaffold_1 11981 12050 (2170) (CT)n#Simple_repeat 1 71 (0) c_b1s251i0

scaffold_1 11981 CTCTCTCTCTCTCCCTCTCCCTCTC-CTCTCTCTCTCTCTCTCTCTCTCT 12029
Expand Down Expand Up @@ -114,6 +98,6 @@ Gap_init rate = 0.03 (1 / 35), avg. gap size = 1.00 (1 / 1)
## Total Length: 14220
## Total NonMask ( excluding >20bp runs of N/X bases ): 14220
## Total NonSub ( excluding all non ACGT bases ):14220
RepeatMasker version 4.1.5 , default mode
run with rmblastn version 2.13.0+
RM Library: CONS-Dfam_3.7
RepeatMasker version 4.1.7-p1 , default mode
run with rmblastn version 2.14.1+
RM Library: CONS-Dfam_3.8
5 changes: 2 additions & 3 deletions tools/repeatmasker/test-data/small_dfam.fasta.log
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,5 @@ SW score % div. % del. % ins. query sequence pos in query: begin end (left) rep
15 18.4 10.2 0.0 scaffold_1 4853 4901 (9319) (TC)n Simple_repeat 1 54 (0) 4
13 19.1 1.8 7.7 scaffold_1 6230 6284 (7936) (TAATTAA)n Simple_repeat 1 52 (0) 5
15 28.3 0.0 3.5 scaffold_1 6548 6606 (7614) (GACA)n Simple_repeat 1 57 (0) 6
180 25.4 1.4 5.9 scaffold_1 8140 8210 (6010) C AmnL2-1 LINE/L2 (11) 2602 2535 7
67 2.9 1.4 0.0 scaffold_1 11981 12050 (2170) (CT)n Simple_repeat 1 71 (0) 8
19 15.4 2.8 0.0 scaffold_1 12078 12113 (2107) (CT)n Simple_repeat 1 37 (0) 8
67 2.9 1.4 0.0 scaffold_1 11981 12050 (2170) (CT)n Simple_repeat 1 71 (0) 7
19 15.4 2.8 0.0 scaffold_1 12078 12113 (2107) (CT)n Simple_repeat 1 37 (0) 7
6 changes: 3 additions & 3 deletions tools/repeatmasker/test-data/small_dfam.fasta.masked
Original file line number Diff line number Diff line change
Expand Up @@ -161,9 +161,9 @@ TGTACACTTACTTCTATGGAAAAGATGGAGCGCCACAGTGAAAACTGTTT
TGAGTCTGTGAGGGGAAAACACAGCATCAGTCACAGTGAAACACTAGGTG
GCACTCAGGTTTGACATTCAAGCATTTGTATCCCACAGTTACTGTTGCTG
GGTTGTTGGCTGGCATGCAACTTAATATGATCTATCTTTAAATCAGTGTG
TGCAGTGGTTATTTAGTTTAAGTGCTTTTTAATGATGTCNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNGTTCTGAGGTGGCATTGCCCTCAGGTATATATCCCTCAGG
TGCAGTGGTTATTTAGTTTAAGTGCTTTTTAATGATGTCACAACATTATT
TTGTCTACACCCTGCATACAGCACAGTATATTAAATTTAGGTTTTATTAA
GTTAAGTAATGTTCTGAGGTGGCATTGCCCTCAGGTATATATCCCTCAGG
CAGTGTTACTGGACAGCATATAGATTGTAATGTTGTGTAAGCAGTGTTGT
GTAAGCTTTTTTAACCAAAATGCTCTCATGTTTCTTTGTTACCACAGTGG
TTTTAGTGATGTTTTGTGCTGTGAACAGAATCATGATTTCTGCAGACACT
Expand Down
18 changes: 9 additions & 9 deletions tools/repeatmasker/test-data/small_dfam.fasta.stats
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ file name: rm_input.fasta
sequences: 1
total length: 14220 bp (14220 bp excl N/X-runs)
GC level: 39.94 %
bases masked: 449 bp ( 3.16 %)
bases masked: 378 bp ( 2.66 %)
==================================================
number of length percentage
elements* occupied of sequence
Expand All @@ -12,9 +12,9 @@ SINEs: 0 0 bp 0.00 %
ALUs 0 0 bp 0.00 %
MIRs 0 0 bp 0.00 %

LINEs: 1 71 bp 0.50 %
LINEs: 0 0 bp 0.00 %
LINE1 0 0 bp 0.00 %
LINE2 1 71 bp 0.50 %
LINE2 0 0 bp 0.00 %
L3/CR1 0 0 bp 0.00 %

LTR elements: 0 0 bp 0.00 %
Expand All @@ -29,7 +29,7 @@ DNA elements: 0 0 bp 0.00 %

Unclassified: 0 0 bp 0.00 %

Total interspersed repeats: 71 bp 0.50 %
Total interspersed repeats: 0 bp 0.00 %


Small RNA: 0 0 bp 0.00 %
Expand All @@ -44,8 +44,8 @@ Low complexity: 0 0 bp 0.00 %
Runs of >=20 X/Ns in query were excluded in % calcs


The query species was assumed to be human
RepeatMasker version 4.1.5 , default mode
run with rmblastn version 2.13.0+
FamDB: CONS-Dfam_3.7
The query species was assumed to be Homo sapiens
RepeatMasker version 4.1.7-p1 , default mode

run with rmblastn version 2.14.1+
FamDB: CONS-Dfam_3.8
6 changes: 3 additions & 3 deletions tools/repeatmasker/test-data/small_dfam_rattus.fasta.cat
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,6 @@ Gap_init rate = 0.03 (1 / 35), avg. gap size = 1.00 (1 / 1)
## Total Length: 14220
## Total NonMask ( excluding >20bp runs of N/X bases ): 14220
## Total NonSub ( excluding all non ACGT bases ):14220
RepeatMasker version 4.1.5 , default mode
run with rmblastn version 2.13.0+
RM Library: CONS-Dfam_3.7
RepeatMasker version 4.1.7-p1 , default mode
run with rmblastn version 2.14.1+
RM Library: CONS-Dfam_3.8
10 changes: 5 additions & 5 deletions tools/repeatmasker/test-data/small_dfam_rattus.fasta.stats
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ Low complexity: 0 0 bp 0.00 %
Runs of >=20 X/Ns in query were excluded in % calcs


The query species was assumed to be rattus
RepeatMasker version 4.1.5 , default mode
run with rmblastn version 2.13.0+
FamDB: CONS-Dfam_3.7
The query species was assumed to be rodent
RepeatMasker version 4.1.7-p1 , default mode

run with rmblastn version 2.14.1+
FamDB: CONS-Dfam_3.8
6 changes: 3 additions & 3 deletions tools/repeatmasker/test-data/small_dfam_up.fasta.cat
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,6 @@ Gap_init rate = 0.03 (1 / 35), avg. gap size = 1.00 (1 / 1)
## Total Length: 14220
## Total NonMask ( excluding >20bp runs of N/X bases ): 14220
## Total NonSub ( excluding all non ACGT bases ):14220
RepeatMasker version 4.1.5 , default mode
run with rmblastn version 2.13.0+
RM Library: CONS-Dfam_1.0
RepeatMasker version 4.1.7-p1 , default mode
run with rmblastn version 2.14.1+
RM Library: CONS-Dfam_3.8
8 changes: 4 additions & 4 deletions tools/repeatmasker/test-data/small_dfam_up.fasta.stats
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ Low complexity: 0 0 bp 0.00 %


The query species was assumed to be rodent
RepeatMasker version 4.1.5 , default mode
run with rmblastn version 2.13.0+
FamDB: CONS-Dfam_1.0
RepeatMasker version 4.1.7-p1 , default mode

run with rmblastn version 2.14.1+
FamDB: CONS-Dfam_3.8
Loading