diff --git a/.gitignore b/.gitignore index f58380d..bbc9a19 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,8 @@ stats.json log/* !log/placeholder_for_sbatch_output deployed/* +freezed + .vscode/* .DS_Store diff --git a/defaults/parameters.yaml b/defaults/parameters.yaml index f6f9942..2310cdc 100644 --- a/defaults/parameters.yaml +++ b/defaults/parameters.yaml @@ -11,12 +11,28 @@ files: annotation: "defaults/annotation.gff" include: "defaults/include.txt" color_schemes: "defaults/color_schemes.tsv" - clades: "defaults/clades.tsv" - ordering: "defaults/color_ordering.tsv" - lat_longs: "defaults/lat_longs.tsv" auspice_config: "defaults/auspice_config.json" description: "defaults/description.md" - mut_fit: "defaults/mutational_fitness_distance_map.json" + clades: "builds/clades.tsv" + ordering: "builds/color_ordering.tsv" + lat_longs: "builds/lat_longs.tsv" + mut_fit: "builds/mutational_fitness_distance_map.json" + pango_designations: "builds/pango_designations.csv" + metadata_designated: "builds/metadata_designated.tsv" + +data_source: + clades: "https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/clades.tsv" + lat_longs: "https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/lat_longs.tsv" + color_ordering: "https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/color_ordering.tsv" + mut_fit: "https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/mutational_fitness_distance_map.json" + pango_designations: "https://raw.githubusercontent.com/cov-lineages/pango-designation/master/lineages.csv" + +origins: + gisaid: + metadata: "s3://nextstrain-ncov-private/metadata.tsv.gz" + sequences: "s3://nextstrain-ncov-private/sequences.fasta.xz" + exclude: "https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/exclude.txt" + filters: "--min-length 27000 --min-date 2019-12-01" tree: tree-builder-args: "'-ninit 10 -n 4 -czb'" @@ -84,3 +100,13 @@ traits: sampling_bias_correction: 2.5 columns: ["country"] +distances: 
+ comparisons: ['root', 'root', 'root', 'root', 'root', 'root'] + attributes: ['S1_mutations', 'DMS_convalescent_serum', 'DMS_Class_1', 'DMS_Class_2', 'DMS_Class_3', 'ACE2_binding_site_mutations'] + maps: + - "defaults/distance_maps/S1.json" + - "defaults/distance_maps/convalescent_serum_mean_dms.json" + - "defaults/distance_maps/class_1_mean_dms.json" + - "defaults/distance_maps/class_2_mean_dms.json" + - "defaults/distance_maps/class_3_mean_dms.json" + - "defaults/distance_maps/ace2.json" \ No newline at end of file diff --git a/profiles/basel-combined/builds.yaml b/profiles/basel-combined/builds.yaml index bdc7b6b..399d454 100644 --- a/profiles/basel-combined/builds.yaml +++ b/profiles/basel-combined/builds.yaml @@ -8,19 +8,15 @@ files: ordering: "builds/color_ordering.tsv" lat_longs: "builds/lat_longs.tsv" mut_fit: "builds/mutational_fitness_distance_map.json" + pango_designations: "builds/pango_designations.csv" + metadata_designated: "builds/metadata_designated.tsv" data_source: clades: "https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/clades.tsv" lat_longs: "https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/lat_longs.tsv" color_ordering: "https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/color_ordering.tsv" mut_fit: "https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/mutational_fitness_distance_map.json" - -origins: - gisaid: - metadata: "s3://nextstrain-ncov-private/metadata.tsv.gz" - sequences: "s3://nextstrain-ncov-private/sequences.fasta.xz" - exclude: "https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/exclude.txt" - filters: "--min-length 27000 --min-date 2019-12-01" + pango_designations: "https://raw.githubusercontent.com/cov-lineages/pango-designation/master/lineages.csv" build_dir: "builds-combined" auspice_dir: "auspice-combined" diff --git a/profiles/basel-countries/auspice_config.json b/profiles/basel-countries/auspice_config.json index ceeb579..8959dc0 100644 --- 
a/profiles/basel-countries/auspice_config.json +++ b/profiles/basel-countries/auspice_config.json @@ -1,135 +1,135 @@ -{ - "title": "Genomic epidemiology of SARS-CoV-2 in Europe", - "build_url": "https://github.com/neherlab/ncov-simple", - "maintainers": [ - { "name": "Cornelius Roemer", "url": "https://neherlab.org" }, - { "name": "Richard Neher", "url": "https://neherlab.org" } - ], - "data_provenance": [ - { - "name": "GISAID" - } - ], - "colorings": [ - { - "key": "country", - "title": "Country", - "type": "categorical" - }, - { - "key": "division", - "title": "Admin Division", - "type": "categorical" - }, - { - "key": "pango_lineage", - "title": "PANGO Lineage by GISAID", - "type": "categorical" - }, - { - "key": "pango_default", - "title": "PANGO Lineage by Pangolin", - "type": "categorical" - }, - { - "key": "pango_usher", - "title": "PANGO Lineage by Usher", - "type": "categorical" - }, - { - "key": "S1_mutations", - "title": "S1 mutations", - "type": "continuous" - }, - { - "key": "GISAID_clade", - "title": "GISAID Clade", - "type": "categorical" - }, - { - "key": "subclade_membership", - "title": "Emerging clade", - "type": "categorical" - }, - { - "key": "region", - "title": "Region", - "type": "categorical" - }, - { - "key": "host", - "title": "Host", - "type": "categorical" - }, - { - "key": "age", - "title": "Age", - "type": "continuous" - }, - { - "key": "sex", - "title": "Sex", - "type": "categorical" - }, - { - "key": "author", - "title": "Authors", - "type": "categorical" - }, - { - "key": "originating_lab", - "title": "Originating Lab", - "type": "categorical" - }, - { - "key": "submitting_lab", - "title": "Submitting Lab", - "type": "categorical" - }, - { - "key": "recency", - "title": "Submission Date", - "type": "categorical" - }, - { - "key": "country_exposure", - "title": "Country of exposure", - "type": "categorical" - }, - { - "key": "division_exposure", - "title": "Division of exposure", - "type": "categorical" - }, - { - "key": 
"region_exposure", - "title": "Region of exposure", - "type": "categorical" - } - ], - "geo_resolutions": ["location", "division", "country", "region"], - "display_defaults": { - "color_by": "clade_membership", - "distance_measure": "num_date", - "geo_resolution": "country", - "map_triplicate": true, - "branch_label": "clade", - "transmission_lines": false - }, - "filters": [ - "recency", - "region", - "country", - "division", - "location", - "host", - "S1_mutations", - "pango_lineage", - "pango_default", - "pango_usher", - "clade_membership", - "emerging_lineage", - "author" - ], - "panels": ["tree", "map", "entropy", "frequencies"] -} +{ + "title": "Genomic epidemiology of SARS-CoV-2 in Europe", + "build_url": "https://github.com/neherlab/ncov-simple", + "maintainers": [ + { "name": "Cornelius Roemer", "url": "https://neherlab.org" }, + { "name": "Richard Neher", "url": "https://neherlab.org" } + ], + "data_provenance": [ + { + "name": "GISAID" + } + ], + "colorings": [ + { + "key": "country", + "title": "Country", + "type": "categorical" + }, + { + "key": "division", + "title": "Admin Division", + "type": "categorical" + }, + { + "key": "pango_lineage", + "title": "PANGO Lineage by GISAID", + "type": "categorical" + }, + { + "key": "pango_default", + "title": "PANGO Lineage by Pangolin", + "type": "categorical" + }, + { + "key": "pango_usher", + "title": "PANGO Lineage by Usher", + "type": "categorical" + }, + { + "key": "S1_mutations", + "title": "S1 mutations", + "type": "continuous" + }, + { + "key": "GISAID_clade", + "title": "GISAID Clade", + "type": "categorical" + }, + { + "key": "subclade_membership", + "title": "Emerging clade", + "type": "categorical" + }, + { + "key": "region", + "title": "Region", + "type": "categorical" + }, + { + "key": "host", + "title": "Host", + "type": "categorical" + }, + { + "key": "age", + "title": "Age", + "type": "continuous" + }, + { + "key": "sex", + "title": "Sex", + "type": "categorical" + }, + { + "key": "author", + 
"title": "Authors", + "type": "categorical" + }, + { + "key": "originating_lab", + "title": "Originating Lab", + "type": "categorical" + }, + { + "key": "submitting_lab", + "title": "Submitting Lab", + "type": "categorical" + }, + { + "key": "recency", + "title": "Submission Date", + "type": "categorical" + }, + { + "key": "country_exposure", + "title": "Country of exposure", + "type": "categorical" + }, + { + "key": "division_exposure", + "title": "Division of exposure", + "type": "categorical" + }, + { + "key": "region_exposure", + "title": "Region of exposure", + "type": "categorical" + } + ], + "geo_resolutions": ["location", "division", "country", "region"], + "display_defaults": { + "color_by": "clade_membership", + "distance_measure": "num_date", + "geo_resolution": "country", + "map_triplicate": true, + "branch_label": "clade", + "transmission_lines": false + }, + "filters": [ + "recency", + "region", + "country", + "division", + "location", + "host", + "S1_mutations", + "pango_lineage", + "pango_default", + "pango_usher", + "clade_membership", + "emerging_lineage", + "author" + ], + "panels": ["tree", "map", "entropy", "frequencies"] +} diff --git a/profiles/basel-swiss/auspice_config.json b/profiles/basel-swiss/auspice_config.json index 6da6838..a65e9d2 100644 --- a/profiles/basel-swiss/auspice_config.json +++ b/profiles/basel-swiss/auspice_config.json @@ -1,124 +1,124 @@ -{ - "title": "Genomic epidemiology of novel coronavirus in Switzerland", - "build_url": "https://github.com/neherlab/ncov-simple", - "maintainers": [ - { "name": "Cornelius Roemer", "url": "https://neherlab.org" }, - { "name": "Richard Neher", "url": "https://neherlab.org" }, - { "name": "Sarah Nadeau", "url": "https://bsse.ethz.ch/cevo" }, - { "name": "Tanja Stadler", "url": "https://bsse.ethz.ch/cevo" } - ], - "data_provenance": [ - { - "name": "GISAID" - } - ], - "colorings": [ - { - "key": "location", - "title": "Location", - "type": "categorical" - }, - { - "key": "division", - 
"title": "Admin Division", - "type": "categorical" - }, - { - "key": "country", - "title": "Country", - "type": "categorical" - }, - { - "key": "region", - "title": "Region", - "type": "categorical" - }, - { - "key": "S1_mutations", - "title": "S1 mutations", - "type": "continuous" - }, - { - "key": "pango_lineage", - "title": "PANGO Lineage", - "type": "categorical" - }, - { - "key": "subclade_membership", - "title": "Emerging clade", - "type": "categorical" - }, - { - "key": "host", - "title": "Host", - "type": "categorical" - }, - { - "key": "age", - "title": "Age", - "type": "continuous" - }, - { - "key": "sex", - "title": "Sex", - "type": "categorical" - }, - { - "key": "author", - "title": "Authors", - "type": "categorical" - }, - { - "key": "originating_lab", - "title": "Originating Lab", - "type": "categorical" - }, - { - "key": "submitting_lab", - "title": "Submitting Lab", - "type": "categorical" - }, - { - "key": "recency", - "title": "Submission Date", - "type": "categorical" - }, - { - "key": "country_exposure", - "title": "Country of exposure", - "type": "categorical" - }, - { - "key": "division_exposure", - "title": "Division of exposure", - "type": "categorical" - }, - { - "key": "region_exposure", - "title": "Region of exposure", - "type": "categorical" - } - ], - "geo_resolutions": ["location", "division", "country", "region"], - "display_defaults": { - "color_by": "clade_membership", - "distance_measure": "num_date", - "geo_resolution": "division", - "map_triplicate": true, - "branch_label": "clade", - "transmission_lines": false - }, - "filters": [ - "country", - "region", - "recency", - "division", - "location", - "host", - "pango_lineage", - "clade_membership", - "emerging_lineage", - "author" - ], - "panels": ["tree", "map", "entropy", "frequencies"] -} +{ + "title": "Genomic epidemiology of novel coronavirus in Switzerland", + "build_url": "https://github.com/neherlab/ncov-simple", + "maintainers": [ + { "name": "Cornelius Roemer", "url": 
"https://neherlab.org" }, + { "name": "Richard Neher", "url": "https://neherlab.org" }, + { "name": "Sarah Nadeau", "url": "https://bsse.ethz.ch/cevo" }, + { "name": "Tanja Stadler", "url": "https://bsse.ethz.ch/cevo" } + ], + "data_provenance": [ + { + "name": "GISAID" + } + ], + "colorings": [ + { + "key": "location", + "title": "Location", + "type": "categorical" + }, + { + "key": "division", + "title": "Admin Division", + "type": "categorical" + }, + { + "key": "country", + "title": "Country", + "type": "categorical" + }, + { + "key": "region", + "title": "Region", + "type": "categorical" + }, + { + "key": "S1_mutations", + "title": "S1 mutations", + "type": "continuous" + }, + { + "key": "pango_lineage", + "title": "PANGO Lineage", + "type": "categorical" + }, + { + "key": "subclade_membership", + "title": "Emerging clade", + "type": "categorical" + }, + { + "key": "host", + "title": "Host", + "type": "categorical" + }, + { + "key": "age", + "title": "Age", + "type": "continuous" + }, + { + "key": "sex", + "title": "Sex", + "type": "categorical" + }, + { + "key": "author", + "title": "Authors", + "type": "categorical" + }, + { + "key": "originating_lab", + "title": "Originating Lab", + "type": "categorical" + }, + { + "key": "submitting_lab", + "title": "Submitting Lab", + "type": "categorical" + }, + { + "key": "recency", + "title": "Submission Date", + "type": "categorical" + }, + { + "key": "country_exposure", + "title": "Country of exposure", + "type": "categorical" + }, + { + "key": "division_exposure", + "title": "Division of exposure", + "type": "categorical" + }, + { + "key": "region_exposure", + "title": "Region of exposure", + "type": "categorical" + } + ], + "geo_resolutions": ["location", "division", "country", "region"], + "display_defaults": { + "color_by": "clade_membership", + "distance_measure": "num_date", + "geo_resolution": "division", + "map_triplicate": true, + "branch_label": "clade", + "transmission_lines": false + }, + "filters": [ + 
"country", + "region", + "recency", + "division", + "location", + "host", + "pango_lineage", + "clade_membership", + "emerging_lineage", + "author" + ], + "panels": ["tree", "map", "entropy", "frequencies"] +} diff --git a/profiles/clades/auspice_config.json b/profiles/clades/auspice_config.json deleted file mode 100644 index fa6a530..0000000 --- a/profiles/clades/auspice_config.json +++ /dev/null @@ -1,67 +0,0 @@ -{ - "title": "Your samples placed on a SARS-CoV-2 phylogeny", - "build_url": "https://github.com/nextstrain/nextclade", - "maintainers": [ - {"name": "Ivan Aksamentov", "url": "https://neherlab.org"}, - {"name": "Richard Neher", "url": "https://neherlab.org"} - ], - "colorings": [ - { - "key": "region", - "title": "Region", - "type": "categorical" - }, - { - "key": "country", - "title": "Country", - "type": "categorical" - }, - { - "key": "division", - "title": "Admin Division", - "type": "categorical" - }, - { - "key": "pango_lineage", - "title": "Pango Lineage", - "type": "categorical" - }, - { - "key": "GISAID_clade", - "title": "GISAID Clade", - "type": "categorical" - }, - { - "key": "subclade_membership", - "title": "Emerging clade", - "type": "categorical" - }, - { - "key": "region_exposure", - "title": "Region of exposure", - "type": "categorical" - } - ], - "geo_resolutions": [ - "region" - ], - "display_defaults": { - "color_by": "clade_membership", - "distance_measure": "div", - "geo_resolution": "region", - "map_triplicate": true, - "branch_label": "clade", - "transmission_lines": false - }, - "filters": [ - "region", - "country", - "division", - "new_node", - "pango_lineage", - "clade_membership" - ], - "panels": [ - "tree" - ] - } diff --git a/profiles/clades/builds.yaml b/profiles/clades/builds.yaml deleted file mode 100644 index 8952a0c..0000000 --- a/profiles/clades/builds.yaml +++ /dev/null @@ -1,48 +0,0 @@ -title: 'Your samples placed on a SARS-CoV-2 phylogeny' - -files: - reference: "defaults/reference_seq.gb" - alignment_reference: 
"defaults/reference_seq.fasta" - annotation: "defaults/annotation.gff" - include: "defaults/include.txt" - color_schemes: "defaults/color_schemes.tsv" - clades: "defaults/clades.tsv" - ordering: "defaults/color_ordering.tsv" - lat_longs: "defaults/lat_longs.tsv" - auspice_config: "profiles/clades/auspice_config.json" - description: "profiles/clades/description.md" - -data_source: - clades: "https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/clades.tsv" - -origins: - gisaid: - metadata: "s3://nextstrain-ncov-private/metadata.tsv.gz" - sequences: "s3://nextstrain-ncov-private/sequences.fasta.gz" - exclude: "https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/exclude.txt" - filters: "--min-length 27000" - -refine: - root: 'Wuhan/Hu-1/2019' - divergence_unit: 'mutations' - clock_filter_iqd: 4 - no_timetree: True - clock_rate: 0.0007 - clock_std_dev: 0.0003 - coalescent: "skyline" - date_inference: "marginal" - clock_filter_iqd: 4 - -builds: - nextclade: - subsamples: - early: - filters: "--min-length 28500 --group-by year month division pango_lineage --subsample-max-sequences 1000 --max-date 2021-03-01 --exclude-where QC_rare_mutations!=good QC_snp_clusters!=good" - late: - filters: "--min-length 28500 --group-by year month division pango_lineage --subsample-max-sequences 3000 --min-date 2021-03-01 --exclude-where 'rare_mutations>15' QC_snp_clusters!=good" - nextclade-2k: - subsamples: - early: - filters: "--min-length 28500 --group-by year month division pango_lineage --subsample-max-sequences 500 --max-date 2021-03-01 --exclude-where QC_rare_mutations!=good QC_snp_clusters!=good" - late: - filters: "--min-length 28500 --group-by year month division pango_lineage --subsample-max-sequences 1500 --min-date 2021-03-01 --exclude-where 'rare_mutations>15' QC_snp_clusters!=good" diff --git a/profiles/early_pandemic/auspice_config.json b/profiles/early_pandemic/auspice_config.json index 16ebe66..ce7294e 100644 --- 
a/profiles/early_pandemic/auspice_config.json +++ b/profiles/early_pandemic/auspice_config.json @@ -1,99 +1,99 @@ -{ - "title": "Genomic epidemiology of novel coronavirus in Switzerland", - "build_url": "https://github.com/nextstrain/ncov", - "maintainers": [ - { "name": "Emma Hodcroft", "url": "https://neherlab.org" }, - { "name": "Richard Neher", "url": "https://neherlab.org" }, - { "name": "Sarah Nadeau", "url": "https://bsse.ethz.ch/cevo" }, - { "name": "Tanja Stadler", "url": "https://bsse.ethz.ch/cevo" } - ], - "data_provenance": [ - { - "name": "GISAID" - } - ], - "colorings": [ - { - "key": "location", - "title": "Location", - "type": "categorical" - }, - { - "key": "division", - "title": "Admin Division", - "type": "categorical" - }, - { - "key": "country", - "title": "Country", - "type": "categorical" - }, - { - "key": "region", - "title": "Region", - "type": "categorical" - }, - { - "key": "pango_lineage", - "title": "PANGO Lineage", - "type": "categorical" - }, - { - "key": "host", - "title": "Host", - "type": "categorical" - }, - { - "key": "author", - "title": "Authors", - "type": "categorical" - }, - { - "key": "originating_lab", - "title": "Originating Lab", - "type": "categorical" - }, - { - "key": "submitting_lab", - "title": "Submitting Lab", - "type": "categorical" - }, - { - "key": "country_exposure", - "title": "Country of exposure", - "type": "categorical" - }, - { - "key": "division_exposure", - "title": "Division of exposure", - "type": "categorical" - }, - { - "key": "region_exposure", - "title": "Region of exposure", - "type": "categorical" - } - ], - "geo_resolutions": ["location", "division", "country", "region"], - "display_defaults": { - "color_by": "country_exposure", - "distance_measure": "num_date", - "geo_resolution": "division", - "map_triplicate": true, - "branch_label": "clade", - "transmission_lines": false - }, - "filters": [ - "country", - "region", - "recency", - "division", - "location", - "host", - "pango_lineage", - 
"clade_membership", - "emerging_lineage", - "author" - ], - "panels": ["tree", "map", "entropy", "frequencies"] -} +{ + "title": "Genomic epidemiology of novel coronavirus in Switzerland", + "build_url": "https://github.com/nextstrain/ncov", + "maintainers": [ + { "name": "Emma Hodcroft", "url": "https://neherlab.org" }, + { "name": "Richard Neher", "url": "https://neherlab.org" }, + { "name": "Sarah Nadeau", "url": "https://bsse.ethz.ch/cevo" }, + { "name": "Tanja Stadler", "url": "https://bsse.ethz.ch/cevo" } + ], + "data_provenance": [ + { + "name": "GISAID" + } + ], + "colorings": [ + { + "key": "location", + "title": "Location", + "type": "categorical" + }, + { + "key": "division", + "title": "Admin Division", + "type": "categorical" + }, + { + "key": "country", + "title": "Country", + "type": "categorical" + }, + { + "key": "region", + "title": "Region", + "type": "categorical" + }, + { + "key": "pango_lineage", + "title": "PANGO Lineage", + "type": "categorical" + }, + { + "key": "host", + "title": "Host", + "type": "categorical" + }, + { + "key": "author", + "title": "Authors", + "type": "categorical" + }, + { + "key": "originating_lab", + "title": "Originating Lab", + "type": "categorical" + }, + { + "key": "submitting_lab", + "title": "Submitting Lab", + "type": "categorical" + }, + { + "key": "country_exposure", + "title": "Country of exposure", + "type": "categorical" + }, + { + "key": "division_exposure", + "title": "Division of exposure", + "type": "categorical" + }, + { + "key": "region_exposure", + "title": "Region of exposure", + "type": "categorical" + } + ], + "geo_resolutions": ["location", "division", "country", "region"], + "display_defaults": { + "color_by": "country_exposure", + "distance_measure": "num_date", + "geo_resolution": "division", + "map_triplicate": true, + "branch_label": "clade", + "transmission_lines": false + }, + "filters": [ + "country", + "region", + "recency", + "division", + "location", + "host", + "pango_lineage", + 
"clade_membership", + "emerging_lineage", + "author" + ], + "panels": ["tree", "map", "entropy", "frequencies"] +} diff --git a/profiles/genbank/auspice_config.json b/profiles/genbank/auspice_config.json index 8fd3d27..7d3e64e 100644 --- a/profiles/genbank/auspice_config.json +++ b/profiles/genbank/auspice_config.json @@ -1,116 +1,116 @@ -{ - "title": "Genomic epidemiology of SARS-CoV-2", - "build_url": "https://github.com/nextstrain/ncov-simple", - "maintainers": [ - { "name": "nextstrain team", "url": "https://nextstrain.org" } - ], - "data_provenance": [ - { - "name": "INSDC" - } - ], - "colorings": [ - { - "key": "country", - "title": "Country", - "type": "categorical" - }, - { - "key": "division", - "title": "Admin Division", - "type": "categorical" - }, - { - "key": "pango_lineage", - "title": "PANGO Lineage", - "type": "categorical" - }, - { - "key": "S1_mutations", - "title": "S1 mutations", - "type": "continuous" - }, - { - "key": "subclade_membership", - "title": "Emerging clade", - "type": "categorical" - }, - { - "key": "region", - "title": "Region", - "type": "categorical" - }, - { - "key": "host", - "title": "Host", - "type": "categorical" - }, - { - "key": "age", - "title": "Age", - "type": "continuous" - }, - { - "key": "sex", - "title": "Sex", - "type": "categorical" - }, - { - "key": "author", - "title": "Authors", - "type": "categorical" - }, - { - "key": "originating_lab", - "title": "Originating Lab", - "type": "categorical" - }, - { - "key": "submitting_lab", - "title": "Submitting Lab", - "type": "categorical" - }, - { - "key": "recency", - "title": "Submission Date", - "type": "categorical" - }, - { - "key": "country_exposure", - "title": "Country of exposure", - "type": "categorical" - }, - { - "key": "division_exposure", - "title": "Division of exposure", - "type": "categorical" - }, - { - "key": "region_exposure", - "title": "Region of exposure", - "type": "categorical" - } - ], - "geo_resolutions": ["location", "division", "country", 
"region"], - "display_defaults": { - "color_by": "clade_membership", - "distance_measure": "num_date", - "geo_resolution": "country", - "map_triplicate": true, - "branch_label": "clade", - "transmission_lines": false - }, - "filters": [ - "recency", - "region", - "country", - "division", - "location", - "host", - "pango_lineage", - "clade_membership", - "emerging_lineage", - "author" - ], - "panels": ["tree", "map", "entropy", "frequencies"] -} +{ + "title": "Genomic epidemiology of SARS-CoV-2", + "build_url": "https://github.com/nextstrain/ncov-simple", + "maintainers": [ + { "name": "nextstrain team", "url": "https://nextstrain.org" } + ], + "data_provenance": [ + { + "name": "INSDC" + } + ], + "colorings": [ + { + "key": "country", + "title": "Country", + "type": "categorical" + }, + { + "key": "division", + "title": "Admin Division", + "type": "categorical" + }, + { + "key": "pango_lineage", + "title": "PANGO Lineage", + "type": "categorical" + }, + { + "key": "S1_mutations", + "title": "S1 mutations", + "type": "continuous" + }, + { + "key": "subclade_membership", + "title": "Emerging clade", + "type": "categorical" + }, + { + "key": "region", + "title": "Region", + "type": "categorical" + }, + { + "key": "host", + "title": "Host", + "type": "categorical" + }, + { + "key": "age", + "title": "Age", + "type": "continuous" + }, + { + "key": "sex", + "title": "Sex", + "type": "categorical" + }, + { + "key": "author", + "title": "Authors", + "type": "categorical" + }, + { + "key": "originating_lab", + "title": "Originating Lab", + "type": "categorical" + }, + { + "key": "submitting_lab", + "title": "Submitting Lab", + "type": "categorical" + }, + { + "key": "recency", + "title": "Submission Date", + "type": "categorical" + }, + { + "key": "country_exposure", + "title": "Country of exposure", + "type": "categorical" + }, + { + "key": "division_exposure", + "title": "Division of exposure", + "type": "categorical" + }, + { + "key": "region_exposure", + "title": 
"Region of exposure", + "type": "categorical" + } + ], + "geo_resolutions": ["location", "division", "country", "region"], + "display_defaults": { + "color_by": "clade_membership", + "distance_measure": "num_date", + "geo_resolution": "country", + "map_triplicate": true, + "branch_label": "clade", + "transmission_lines": false + }, + "filters": [ + "recency", + "region", + "country", + "division", + "location", + "host", + "pango_lineage", + "clade_membership", + "emerging_lineage", + "author" + ], + "panels": ["tree", "map", "entropy", "frequencies"] +} diff --git a/profiles/clades/config.yaml b/profiles/pango-cluster/config.yaml similarity index 61% rename from profiles/clades/config.yaml rename to profiles/pango-cluster/config.yaml index 04e4077..b3c8725 100644 --- a/profiles/clades/config.yaml +++ b/profiles/pango-cluster/config.yaml @@ -1,6 +1,7 @@ configfile: - defaults/parameters.yaml # Pull in the default values - - profiles/clades/builds.yaml # Specific builds for this profile + - profiles/pango/builds.yaml # Specific builds for this profile + - profiles/basel-combined/secrets.yaml # Secrets not committed to git # Always print the commands that will be run to the screen for debugging. printshellcmds: True @@ -13,11 +14,14 @@ show-failed-logs: True cluster-config: profiles/cluster/cluster.json -cluster: "sbatch --time={cluster.time} --mem={cluster.mem} --cpus-per-task={cluster.n} --qos={cluster.qos}" +cluster: "python3 profiles/cluster/submit.py" -jobs: 128 +jobs: 512 -jobscript: profiles/cluster/submit.sh +# jobscript: profiles/cluster/submit.sh # Set the name for the job as display in the cluster queue. 
jobname: "{rulename}.{jobid}.sh" + +# For local rules +cores: 4 diff --git a/profiles/pango/auspice_config.json b/profiles/pango/auspice_config.json new file mode 100644 index 0000000..dac91a3 --- /dev/null +++ b/profiles/pango/auspice_config.json @@ -0,0 +1,140 @@ +{ + "title": "Diversity build using only pango designated sequences", + "build_url": "https://github.com/neherlab/ncov-simple", + "maintainers": [ + { "name": "Cornelius Roemer", "url": "https://neherlab.org" }, + { "name": "Richard Neher", "url": "https://neherlab.org" } + ], + "data_provenance": [ + { + "name": "GISAID" + } + ], + "colorings": [ + { + "key": "country", + "title": "Country", + "type": "categorical" + }, + { + "key": "division", + "title": "Admin Division", + "type": "categorical" + }, + { + "key": "lineage", + "title": "PANGO Lineage by designation", + "type": "categorical" + }, + { + "key": "pango_lineage", + "title": "PANGO Lineage by GISAID", + "type": "categorical" + }, + { + "key": "pango_default", + "title": "PANGO Lineage by Pangolin", + "type": "categorical" + }, + { + "key": "pango_usher", + "title": "PANGO Lineage by Usher", + "type": "categorical" + }, + { + "key": "S1_mutations", + "title": "S1 mutations", + "type": "continuous" + }, + { + "key": "GISAID_clade", + "title": "GISAID Clade", + "type": "categorical" + }, + { + "key": "subclade_membership", + "title": "Emerging clade", + "type": "categorical" + }, + { + "key": "region", + "title": "Region", + "type": "categorical" + }, + { + "key": "host", + "title": "Host", + "type": "categorical" + }, + { + "key": "age", + "title": "Age", + "type": "continuous" + }, + { + "key": "sex", + "title": "Sex", + "type": "categorical" + }, + { + "key": "author", + "title": "Authors", + "type": "categorical" + }, + { + "key": "originating_lab", + "title": "Originating Lab", + "type": "categorical" + }, + { + "key": "submitting_lab", + "title": "Submitting Lab", + "type": "categorical" + }, + { + "key": "recency", + "title": "Submission 
Date", + "type": "categorical" + }, + { + "key": "country_exposure", + "title": "Country of exposure", + "type": "categorical" + }, + { + "key": "division_exposure", + "title": "Division of exposure", + "type": "categorical" + }, + { + "key": "region_exposure", + "title": "Region of exposure", + "type": "categorical" + } + ], + "geo_resolutions": ["location", "division", "country", "region"], + "display_defaults": { + "color_by": "clade_membership", + "distance_measure": "num_date", + "geo_resolution": "country", + "map_triplicate": true, + "branch_label": "clade", + "transmission_lines": false + }, + "filters": [ + "recency", + "region", + "country", + "division", + "location", + "host", + "S1_mutations", + "pango_lineage", + "pango_default", + "pango_usher", + "clade_membership", + "emerging_lineage", + "author" + ], + "panels": ["tree", "map", "entropy", "frequencies"] +} diff --git a/profiles/pango/builds.yaml b/profiles/pango/builds.yaml new file mode 100644 index 0000000..7791537 --- /dev/null +++ b/profiles/pango/builds.yaml @@ -0,0 +1,37 @@ +files: + reference: "defaults/reference_seq.gb" + alignment_reference: "defaults/reference_seq.fasta" + annotation: "defaults/annotation.gff" + include: "defaults/include.txt" + color_schemes: "defaults/color_schemes.tsv" + description: "defaults/description.md" + clades: "builds/clades.tsv" + ordering: "builds/color_ordering.tsv" + lat_longs: "builds/lat_longs.tsv" + mut_fit: "builds/mutational_fitness_distance_map.json" + pango_designations: "builds/pango_designations.csv" + metadata_designated: "builds/metadata_designated.tsv" + auspice_config: "profiles/pango/auspice_config.json" + +build_dir: builds-pango +auspice_dir: auspice-pango + +builds: + pango: + deploy_urls: + - "s3://nextstrain-neherlab" + subsamples: + lineages: + filters: "--exclude-where lineage='undesignated' --group-by lineage --sequences-per-group 2" + medium_lineages: + filters: "--exclude-where lineage='undesignated' --group-by lineage 
--sequences-per-group 1 --min-date 2021-01-01" + young_lineages: + filters: "--exclude-where lineage='undesignated' --group-by lineage --sequences-per-group 1 --min-date 2021-06-01" + very_young_lineages: + filters: "--exclude-where lineage='undesignated' --group-by lineage --sequences-per-group 1 --min-date 2021-09-01" + clades: + filters: "--exclude-where lineage='undesignated' --group-by Nextstrain_clade --sequences-per-group 10" + delta: + filters: "--exclude-where lineage!='B.1.617.2' --group-by month --subsample-max-sequences 500 --min-date 2021-02-01" + recent: + filters: "--exclude-where lineage='undesignated' --subsample-max-sequences 500 --min-date 2021-08-01" diff --git a/profiles/pango/config.yaml b/profiles/pango/config.yaml new file mode 100644 index 0000000..2db00c8 --- /dev/null +++ b/profiles/pango/config.yaml @@ -0,0 +1,15 @@ +configfile: + - defaults/parameters.yaml # Pull in the default values + - profiles/pango/builds.yaml # Specific builds for this profile + - profiles/pango/secrets.yaml # Secrets not committed to git + +# Always print the commands that will be run to the screen for debugging. +printshellcmds: True +keep-going: True +reason: True +stats: stats.json + +# Print log files of failed jobs +show-failed-logs: True + +cores: 16 diff --git a/profiles/clades/description.md b/profiles/pango/description.md similarity index 69% rename from profiles/clades/description.md rename to profiles/pango/description.md index cde0eed..a541b5a 100644 --- a/profiles/clades/description.md +++ b/profiles/pango/description.md @@ -1,4 +1 @@ -This phylogeny shows evolutionary relationships of SARS-CoV-2 viruses from the ongoing novel coronavirus COVID-19 pandemic. - -All data we use were deposited in GISAID by scientists around the world. We gratefully acknowledge the authors, originating and submitting laboratories of the genetic sequence and metadata made available through [GISAID](https://gisaid.org) on which this research is based. 
A full listing of all originating and submitting laboratories is available below. An attribution table is available by clicking on "Download Data" at the bottom of the page and then clicking on "Strain Metadata" in the resulting dialog box.
diff --git a/workflow/snakemake_rules/preprocess.smk b/workflow/snakemake_rules/preprocess.smk
index 1e83194..4c70dad 100644
--- a/workflow/snakemake_rules/preprocess.smk
+++ b/workflow/snakemake_rules/preprocess.smk
@@ -12,7 +12,7 @@ and produces
 '''
 import os
 
-localrules: download_sequences, download_metadata, download_exclude, download_clades, preprocess, download_lat_longs, download_color_ordering, download_mutational_fitness_map
+localrules: download_sequences, download_metadata, download_exclude, download_clades, preprocess, download_lat_longs, download_color_ordering, download_mutational_fitness_map, download_pango_designations
 
 rule preprocess:
     input:
@@ -58,7 +58,7 @@ rule download_metadata:
         deflate = lambda w: _infer_decompression(config['origins'][w.origin]['metadata']),
         address = lambda w: config['origins'][w.origin]['metadata']
     output:
-        metadata = "data/{origin}/metadata.tsv"
+        "data/{origin}/metadata_raw.tsv"
     shell: "aws s3 cp {params.address} - | {params.deflate} {input} > {output:q}"
 
 rule download_exclude:
@@ -97,6 +97,33 @@ rule download_mutational_fitness_map:
         source = config["data_source"]["mut_fit"]
     shell: "curl {params.source} -o {output}"
 
+# Download the Pango designation table named in config["data_source"].
+# --fail stops an HTTP error page from being saved as the output; --location follows redirects.
+rule download_pango_designations:
+    output: config["files"]["pango_designations"]
+    params:
+        source = config["data_source"]["pango_designations"]
+    shell: "curl --fail --location {params.source} -o {output}"
+
+# Append a `lineage` column to the combined metadata by matching `strain` against
+# `taxon` in the designations; rows without a match get the value "undesignated".
+# TODO: Fix matching of strain names with whitespace
+rule join_designations_and_metadata:
+    input:
+        designations = config["files"]["pango_designations"],
+        metadata = "pre-processed/metadata_raw.tsv",
+    output:
+        metadata = "pre-processed/metadata.tsv",
+        designations = "builds/pango_designations.tsv"
+    shell:
+        """
+        csv2tsv < {input.designations} > {output.designations} && \
+        tsv-join -H --filter-file {output.designations} \
+            --key-fields taxon --data-fields strain --append-fields lineage {input.metadata} \
+            --write-all undesignated \
+        > {output.metadata}
+        """
+
 rule prealign:
     message:
         """
@@ -142,7 +169,7 @@ rule prealign:
 rule diagnostic:
     message: "Scanning metadata {input.metadata} for problematic sequences. Removing sequences with >{params.clock_filter} deviation from the clock and with more than {params.snp_clusters}."
     input:
-        metadata = "data/{origin}/metadata.tsv"
+        metadata = "data/{origin}/metadata_raw.tsv"
     output:
         to_exclude = "pre-processed/{origin}/problematic_exclude.txt"
     params:
@@ -178,7 +205,7 @@ rule filter:
         """
     input:
         sequences = "pre-processed/{origin}/alignment.fasta.xz",
-        metadata = "data/{origin}/metadata.tsv",
+        metadata = "data/{origin}/metadata_raw.tsv",
         include = "defaults/include.txt",
         exclude = "data/{origin}/exclude.txt",
         problematic = "pre-processed/{origin}/problematic_exclude.txt"
@@ -220,9 +247,9 @@ rule combine_bulk_sequences:
 
 rule combine_bulk_metadata:
     input:
-        [f"data/{origin}/metadata.tsv" for origin in config["origins"]]
+        [f"data/{origin}/metadata_raw.tsv" for origin in config["origins"]]
     output:
-        rules.preprocess.input.metadata
+        "pre-processed/metadata_raw.tsv"
     run:
         if len(input)==1:
             shell(f"cp {input} {output}")