Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adapter tutorials to new framework #357

Merged
merged 4 commits into from
Nov 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions docs/_tutorials/data/genomic_region_generator_ncbi.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#######################
### BASIC PARAMETERS ###
#######################

### General parameters
dir_output: output_genomic_region_generator_ncbi # name of the directory where the output files will be written

### Parameters for genome and gene annotation
source: ncbi # required: indicate that ncbi annotation should be used
# The keys below belong to source_params and must be indented under it;
# at column 0 they would become unrelated top-level keys and source_params would be null.
source_params:
  taxon: vertebrate_mammalian # required: taxon of the species, valid taxa are: archaea, bacteria, fungi, invertebrate, mitochondrion, plant, plasmid, plastid, protozoa, vertebrate_mammalian, vertebrate_other, viral
  species: Homo_sapiens # required: species name in NCBI download format, e.g. 'Homo_sapiens' for human; see https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ for available species names
  annotation_release: 110 # required: release number of annotation e.g. '109' or '109.20211119' or 'current' to use most recent annotation release. Check out release numbers for NCBI at ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/

### Parameters for sequences generation
# Genomic regions that should be generated: set each region you want to generate to true.
# The region flags are children of genomic_regions and must be indented under it.
genomic_regions:
  gene: false
  exon: true
  exon_exon_junction: true
  cds: false
  intron: false

# If exon_exon_junction is set to true, specify the block size, i.e. +/- "block_size" bp around the junction
# Hint: it does not make sense to set the block size larger than the maximum oligo length
exon_exon_junction_block_size: 50
165 changes: 165 additions & 0 deletions docs/_tutorials/data/scrinshot_probe_designer.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
#######################
### USER PARAMETERS ###
#######################

### General parameters
### -----------------------------------------------
n_jobs: 4 # number of cores used to run the pipeline; 2*n_jobs + 1 regions are stored in cache. If memory consumption of the pipeline is too high, reduce this number; if a lot of RAM is available, increase this number to decrease runtime
dir_output: output_scrinshot_probe_designer # name of the directory where the output files will be written
write_intermediate_steps: true # if true, writes the oligo sequences after each step of the pipeline into a csv file

### Parameters for probe sequences generation
### -----------------------------------------------
file_regions: my_genes.txt # file with a list of the genes used to generate the oligo sequences, leave empty if all the genes are used
files_fasta_probe_database: # fasta files with sequences from which the oligos should be generated. Hint: use the genomic_region_generator pipeline to create fasta files of genomic regions of interest
- output_genomic_region_generator_ncbi/annotation/exon_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna
- output_genomic_region_generator_ncbi/annotation/exon_exon_junction_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna
probe_length_min: 40 # min length of oligos
probe_length_max: 45 # max length of oligos

### Parameters for the property filters, i.e. properties that the sequences should fulfill
### -----------------------------------------------
## target probe sequence
probe_GC_content_min: 40 # minimum GC content of oligos
probe_GC_content_max: 60 # maximum GC content of oligos
probe_Tm_min: 65 # minimum melting temperature of oligos
probe_Tm_max: 75 # maximum melting temperature of oligos
# Per-base thresholds are children of homopolymeric_base_n and must be indented under it.
homopolymeric_base_n: # minimum number of nucleotides to consider it a homopolymeric run per base
  A: 5
  T: 5
  C: 5
  G: 5
## padlock arms
arm_Tm_dif_max: 2 # maximum melting temperature difference of both arms (difference shouldn't be higher than 5! But range is not super important, the lower the better)
arm_length_min: 10 # minimum length of each arm
arm_Tm_min: 50 # minimum melting temperature of each arm
arm_Tm_max: 60 # maximum melting temperature of each arm
## detection oligos
min_thymines: 2 # minimal number of thymines in detection oligo
detect_oligo_length_min: 15 # minimum length of detection probe
detect_oligo_length_max: 40 # maximum length of detection probe

### Parameters for the specificity filters
### -----------------------------------------------
files_fasta_reference_database: # fasta files with sequences used as reference for the specificity filters. Hint: use the genomic_region_generator pipeline to create fasta files of genomic regions of interest
- output_genomic_region_generator_ncbi/annotation/exon_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna
- output_genomic_region_generator_ncbi/annotation/exon_exon_junction_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna
ligation_region_size: 5 # size of the seed region around the ligation site for the blast seed region filter; set to 0 if the ligation region should not be considered for the blast search

### Parameters for set selection
### -----------------------------------------------
probe_isoform_weight: 2 # weight of the isoform consensus of the probe in the efficiency score
probe_GC_content_opt: 50 # max and min values are defined above
probe_GC_weight: 1 # weight of the GC content of the probe in the efficiency score
probe_Tm_opt: 70 # max and min values are defined above
probe_Tm_weight: 1 # weight of the Tm of the probe in the efficiency score

probeset_size_min: 3 # minimum size of probe sets (in case there exists no set of the optimal size) -> genes with fewer oligos will be filtered out and stored in regions_with_insufficient_oligos_for_db_probes
probeset_size_opt: 5 # optimal size of probe sets
distance_between_probes: 0 # how much overlap should be allowed between oligos, e.g. if oligos can overlap x bases choose -x, if oligos can be next to one another choose 0, if oligos should be x bases apart choose x
n_sets: 100 # maximum number of sets to generate

### Parameters for final sequence design
### -----------------------------------------------
U_distance: 5 # preferred minimal distance between U(racils)
detect_oligo_Tm_opt: 56 # optimal melting temperature of detection probe
top_n_sets: 3 # maximum number of sets to report in "padlock_probes.yaml" and "padlock_probes_order.yaml"

############################
### DEVELOPER PARAMETERS ###
############################

### Parameters for the specificity filters
### -----------------------------------------------
# Each *_parameters key below is a mapping; its options must be indented under it.
# At column 0 the options of the two filters would collide as duplicate top-level keys.

# Specificity filter with BlastN
specificity_blastn_search_parameters:
  perc_identity: 80
  strand: "minus" # this parameter is fixed, if reference is whole genome, consider using "both"
  word_size: 10
  dust: "no"
  soft_masking: "false"
  max_target_seqs: 10
  max_hsps: 1000
specificity_blastn_hit_parameters:
  coverage: 50 # can be turned into min_alignment_length

# Cross-hybridization filter with BlastN
cross_hybridization_blastn_search_parameters:
  perc_identity: 80
  strand: "minus" # this parameter is fixed
  word_size: 10
  dust: "no"
  soft_masking: "false"
  max_target_seqs: 10
cross_hybridization_blastn_hit_parameters:
  coverage: 80 # can be turned into min_alignment_length


### Parameters for the Oligo set selection
### -----------------------------------------------
max_graph_size: 5000 # maximum number of oligos that are taken into consideration in the last step (5000 -> ~5GB, 2500 -> ~1GB)


### Parameters for Melting Temperature
### -----------------------------------------------
# The melting temperature is used in 2 different stages (property filters and padlock detection probe design), where a few parameters are shared and the others differ.
# parameters for melting temperature -> for more information on parameters, see: https://biopython.org/docs/1.75/api/Bio.SeqUtils.MeltingTemp.html#Bio.SeqUtils.MeltingTemp.Tm_NN
# Each Tm_* key below is a mapping; its options must be indented under it so they
# do not collide with the identically named options of the detection oligo.
## target probe
Tm_parameters_probe:
  check: true # default
  strict: true # default
  c_seq: null # default
  shift: 0 # default
  nn_table: DNA_NN3 # Allawi & SantaLucia (1997)
  tmm_table: DNA_TMM1 # default
  imm_table: DNA_IMM1 # default
  de_table: DNA_DE1 # default
  dnac1: 50 # [nM]
  dnac2: 0 # [nM]
  selfcomp: false # default
  saltcorr: 7 # Owczarzy et al. (2008)
  Na: 39 # [mM]
  K: 75 # [mM]
  Tris: 20 # [mM]
  Mg: 10 # [mM]
  dNTPs: 0 # [mM] default

Tm_chem_correction_param_probe:
  DMSO: 0 # default
  fmd: 20
  DMSOfactor: 0.75 # default
  fmdfactor: 0.65 # default
  fmdmethod: 1 # default
  GC: null # default

Tm_salt_correction_param_probe: null # if salt correction desired, please add parameters below

## detection oligo
# Each Tm_* key below is a mapping; its options must be indented under it so they
# do not collide with the identically named options of the target probe.
Tm_parameters_detection_oligo:
  check: true # default
  strict: true # default
  c_seq: null # default
  shift: 0 # default
  nn_table: DNA_NN3 # Allawi & SantaLucia (1997)
  tmm_table: DNA_TMM1 # default
  imm_table: DNA_IMM1 # default
  de_table: DNA_DE1 # default
  dnac1: 50 # [nM]
  dnac2: 0 # [nM]
  selfcomp: false # default
  saltcorr: 7 # Owczarzy et al. (2008)
  Na: 39 # [mM]
  K: 0 # [mM] default
  Tris: 0 # [mM] default
  Mg: 0 # [mM] default
  dNTPs: 0 # [mM] default

Tm_chem_correction_param_detection_oligo:
  DMSO: 0 # default
  fmd: 30
  DMSOfactor: 0.75 # default
  fmdfactor: 0.65 # default
  fmdmethod: 1 # default
  GC: null # default

Tm_salt_correction_param_detection_oligo: null # if salt correction desired, please add parameters below
Loading
Loading