diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..c3fa6d7d7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,19 @@ +# Nextflow logs and metadata +*.nextflow.log* +/.nextflow/ + +# SLURM or scheduler logs +test-datasets-*.out +test-datasets-*.err + +# Shell scripts (optional, if not versioning run.sh) +*.sh + +# Work and results/vcfs +/work/ +/results/vcfs/ + +# Nextflow temporary execution files +*.command.* +*.Rout +*.tmp diff --git a/README.md b/README.md index 123415138..1d8b1befd 100644 --- a/README.md +++ b/README.md @@ -23,26 +23,32 @@ git clone -b gwas --single-branch git@github.com:USERNAME/test-datasets.git ## Documentation -nf-core/test-datasets comes with documentation in the `docs/` directory and scripts to generate the example data in the `scripts/` directory. +This test data comes from the 1000 Genomes Project phase 3 release of variant calls. VCF files have been 'chunked' to include only the first 4,500 variants to reduce file sizes. Chromosome Y is excluded. Please see the dataset's [README](https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/README_phase3_callset_20150220) for more details. Covariates and phenotypes were randomly generated for each sample in the VCF. + +nf-core/test-datasets comes with documentation in the `docs/` directory and the data can be regenerated by running `main.nf`. ## Example data organisation -nf-core/test-datasets generated test data is located in the `data/` directory. +nf-core/test-datasets generated test data is located in the `results/` directory and has the following structure. ``` -. 
-├── data_phenotypes_and_covariates -│   ├── example1.covar -│   └── example1.pheno -├── data_shrink_chunk_4500 -│   ├── chr10.vcf.bgz -│   ├── chr10.vcf.bgz.tbi -│   ├── chr11.vcf.bgz -│   ├── chr11.vcf.bgz.tbi -│ -└── data_shrink_combined_4500 - ├── chr1_to_22_and_X.vcf.bgz - └── chr1_to_22_and_X.vcf.bgz.tbi +results/ +├── chunked_vcfs/ +│   ├── chr1_chunked.vcf.gz +│   ├── chr1_chunked.vcf.gz.tbi +│   ├── chr2_chunked.vcf.gz +│   ├── chr2_chunked.vcf.gz.tbi +│   ├── ... +│   ├── chrX_chunked.vcf.gz +│   ├── chrX_chunked.vcf.gz.tbi +│   ├── combined_chunked.vcf.gz +│   └── combined_chunked.vcf.gz.tbi +├── pheno_cov/ +│   ├── example.pheno +│   └── example.covar + ``` +Each chromosome-specific VCF file (chr*.vcf.gz) is accompanied by its corresponding tabix index (.vcf.gz.tbi), enabling efficient querying. A combined VCF and index are also included for downstream association tests or visualization. + ## Support diff --git a/data/data_shrink_chunk_4500/chr1.vcf.bgz b/data/data_shrink_chunk_4500/chr1.vcf.bgz deleted file mode 100644 index c7882782a..000000000 Binary files a/data/data_shrink_chunk_4500/chr1.vcf.bgz and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr1.vcf.bgz.tbi b/data/data_shrink_chunk_4500/chr1.vcf.bgz.tbi deleted file mode 100644 index 1a5ad1939..000000000 Binary files a/data/data_shrink_chunk_4500/chr1.vcf.bgz.tbi and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr10.vcf.bgz b/data/data_shrink_chunk_4500/chr10.vcf.bgz deleted file mode 100644 index 87b58860b..000000000 Binary files a/data/data_shrink_chunk_4500/chr10.vcf.bgz and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr10.vcf.bgz.tbi b/data/data_shrink_chunk_4500/chr10.vcf.bgz.tbi deleted file mode 100644 index f4b770c27..000000000 Binary files a/data/data_shrink_chunk_4500/chr10.vcf.bgz.tbi and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr11.vcf.bgz b/data/data_shrink_chunk_4500/chr11.vcf.bgz deleted file mode 100644 index 
732007d9f..000000000 Binary files a/data/data_shrink_chunk_4500/chr11.vcf.bgz and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr11.vcf.bgz.tbi b/data/data_shrink_chunk_4500/chr11.vcf.bgz.tbi deleted file mode 100644 index c91c09a23..000000000 Binary files a/data/data_shrink_chunk_4500/chr11.vcf.bgz.tbi and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr12.vcf.bgz b/data/data_shrink_chunk_4500/chr12.vcf.bgz deleted file mode 100644 index fe3625bd2..000000000 Binary files a/data/data_shrink_chunk_4500/chr12.vcf.bgz and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr12.vcf.bgz.tbi b/data/data_shrink_chunk_4500/chr12.vcf.bgz.tbi deleted file mode 100644 index 643e74771..000000000 Binary files a/data/data_shrink_chunk_4500/chr12.vcf.bgz.tbi and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr13.vcf.bgz b/data/data_shrink_chunk_4500/chr13.vcf.bgz deleted file mode 100644 index 01da982dc..000000000 Binary files a/data/data_shrink_chunk_4500/chr13.vcf.bgz and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr13.vcf.bgz.tbi b/data/data_shrink_chunk_4500/chr13.vcf.bgz.tbi deleted file mode 100644 index 23defd16c..000000000 Binary files a/data/data_shrink_chunk_4500/chr13.vcf.bgz.tbi and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr14.vcf.bgz b/data/data_shrink_chunk_4500/chr14.vcf.bgz deleted file mode 100644 index 1b341824a..000000000 Binary files a/data/data_shrink_chunk_4500/chr14.vcf.bgz and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr14.vcf.bgz.tbi b/data/data_shrink_chunk_4500/chr14.vcf.bgz.tbi deleted file mode 100644 index 58159a141..000000000 Binary files a/data/data_shrink_chunk_4500/chr14.vcf.bgz.tbi and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr15.vcf.bgz b/data/data_shrink_chunk_4500/chr15.vcf.bgz deleted file mode 100644 index 9bfd39cb9..000000000 Binary files a/data/data_shrink_chunk_4500/chr15.vcf.bgz and /dev/null differ diff --git 
a/data/data_shrink_chunk_4500/chr15.vcf.bgz.tbi b/data/data_shrink_chunk_4500/chr15.vcf.bgz.tbi deleted file mode 100644 index f077edb5c..000000000 Binary files a/data/data_shrink_chunk_4500/chr15.vcf.bgz.tbi and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr16.vcf.bgz b/data/data_shrink_chunk_4500/chr16.vcf.bgz deleted file mode 100644 index 830669709..000000000 Binary files a/data/data_shrink_chunk_4500/chr16.vcf.bgz and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr16.vcf.bgz.tbi b/data/data_shrink_chunk_4500/chr16.vcf.bgz.tbi deleted file mode 100644 index bdc1490ef..000000000 Binary files a/data/data_shrink_chunk_4500/chr16.vcf.bgz.tbi and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr17.vcf.bgz b/data/data_shrink_chunk_4500/chr17.vcf.bgz deleted file mode 100644 index 1e2256603..000000000 Binary files a/data/data_shrink_chunk_4500/chr17.vcf.bgz and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr17.vcf.bgz.tbi b/data/data_shrink_chunk_4500/chr17.vcf.bgz.tbi deleted file mode 100644 index eb4620840..000000000 Binary files a/data/data_shrink_chunk_4500/chr17.vcf.bgz.tbi and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr18.vcf.bgz b/data/data_shrink_chunk_4500/chr18.vcf.bgz deleted file mode 100644 index 80cd34c22..000000000 Binary files a/data/data_shrink_chunk_4500/chr18.vcf.bgz and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr18.vcf.bgz.tbi b/data/data_shrink_chunk_4500/chr18.vcf.bgz.tbi deleted file mode 100644 index 48d15747d..000000000 Binary files a/data/data_shrink_chunk_4500/chr18.vcf.bgz.tbi and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr19.vcf.bgz b/data/data_shrink_chunk_4500/chr19.vcf.bgz deleted file mode 100644 index 406bf5f59..000000000 Binary files a/data/data_shrink_chunk_4500/chr19.vcf.bgz and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr19.vcf.bgz.tbi b/data/data_shrink_chunk_4500/chr19.vcf.bgz.tbi deleted file mode 
100644 index b3bdaf0a0..000000000 Binary files a/data/data_shrink_chunk_4500/chr19.vcf.bgz.tbi and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr2.vcf.bgz b/data/data_shrink_chunk_4500/chr2.vcf.bgz deleted file mode 100644 index 2cc15f154..000000000 Binary files a/data/data_shrink_chunk_4500/chr2.vcf.bgz and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr2.vcf.bgz.tbi b/data/data_shrink_chunk_4500/chr2.vcf.bgz.tbi deleted file mode 100644 index 85a12aba5..000000000 Binary files a/data/data_shrink_chunk_4500/chr2.vcf.bgz.tbi and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr20.vcf.bgz b/data/data_shrink_chunk_4500/chr20.vcf.bgz deleted file mode 100644 index f3732f0c9..000000000 Binary files a/data/data_shrink_chunk_4500/chr20.vcf.bgz and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr20.vcf.bgz.tbi b/data/data_shrink_chunk_4500/chr20.vcf.bgz.tbi deleted file mode 100644 index 3d3794a0a..000000000 Binary files a/data/data_shrink_chunk_4500/chr20.vcf.bgz.tbi and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr21.vcf.bgz b/data/data_shrink_chunk_4500/chr21.vcf.bgz deleted file mode 100644 index 2ed8b7e51..000000000 Binary files a/data/data_shrink_chunk_4500/chr21.vcf.bgz and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr21.vcf.bgz.tbi b/data/data_shrink_chunk_4500/chr21.vcf.bgz.tbi deleted file mode 100644 index 9425e8e84..000000000 Binary files a/data/data_shrink_chunk_4500/chr21.vcf.bgz.tbi and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr22.vcf.bgz b/data/data_shrink_chunk_4500/chr22.vcf.bgz deleted file mode 100644 index 6878b92e1..000000000 Binary files a/data/data_shrink_chunk_4500/chr22.vcf.bgz and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr22.vcf.bgz.tbi b/data/data_shrink_chunk_4500/chr22.vcf.bgz.tbi deleted file mode 100644 index 05c7b7c48..000000000 Binary files a/data/data_shrink_chunk_4500/chr22.vcf.bgz.tbi and /dev/null differ diff 
--git a/data/data_shrink_chunk_4500/chr3.vcf.bgz b/data/data_shrink_chunk_4500/chr3.vcf.bgz deleted file mode 100644 index c50539b0e..000000000 Binary files a/data/data_shrink_chunk_4500/chr3.vcf.bgz and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr3.vcf.bgz.tbi b/data/data_shrink_chunk_4500/chr3.vcf.bgz.tbi deleted file mode 100644 index 3833426f4..000000000 Binary files a/data/data_shrink_chunk_4500/chr3.vcf.bgz.tbi and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr4.vcf.bgz b/data/data_shrink_chunk_4500/chr4.vcf.bgz deleted file mode 100644 index cba7f44b0..000000000 Binary files a/data/data_shrink_chunk_4500/chr4.vcf.bgz and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr4.vcf.bgz.tbi b/data/data_shrink_chunk_4500/chr4.vcf.bgz.tbi deleted file mode 100644 index fabff721e..000000000 Binary files a/data/data_shrink_chunk_4500/chr4.vcf.bgz.tbi and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr5.vcf.bgz b/data/data_shrink_chunk_4500/chr5.vcf.bgz deleted file mode 100644 index 4fc2db1bd..000000000 Binary files a/data/data_shrink_chunk_4500/chr5.vcf.bgz and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr5.vcf.bgz.tbi b/data/data_shrink_chunk_4500/chr5.vcf.bgz.tbi deleted file mode 100644 index f3d679fb9..000000000 Binary files a/data/data_shrink_chunk_4500/chr5.vcf.bgz.tbi and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr6.vcf.bgz b/data/data_shrink_chunk_4500/chr6.vcf.bgz deleted file mode 100644 index 76366a683..000000000 Binary files a/data/data_shrink_chunk_4500/chr6.vcf.bgz and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr6.vcf.bgz.tbi b/data/data_shrink_chunk_4500/chr6.vcf.bgz.tbi deleted file mode 100644 index cfaa263fe..000000000 Binary files a/data/data_shrink_chunk_4500/chr6.vcf.bgz.tbi and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr7.vcf.bgz b/data/data_shrink_chunk_4500/chr7.vcf.bgz deleted file mode 100644 index 224587093..000000000 
Binary files a/data/data_shrink_chunk_4500/chr7.vcf.bgz and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr7.vcf.bgz.tbi b/data/data_shrink_chunk_4500/chr7.vcf.bgz.tbi deleted file mode 100644 index be9d3b0fd..000000000 Binary files a/data/data_shrink_chunk_4500/chr7.vcf.bgz.tbi and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr8.vcf.bgz b/data/data_shrink_chunk_4500/chr8.vcf.bgz deleted file mode 100644 index 9466b4aaf..000000000 Binary files a/data/data_shrink_chunk_4500/chr8.vcf.bgz and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr8.vcf.bgz.tbi b/data/data_shrink_chunk_4500/chr8.vcf.bgz.tbi deleted file mode 100644 index 9f1f4da98..000000000 Binary files a/data/data_shrink_chunk_4500/chr8.vcf.bgz.tbi and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr9.vcf.bgz b/data/data_shrink_chunk_4500/chr9.vcf.bgz deleted file mode 100644 index b763c8aeb..000000000 Binary files a/data/data_shrink_chunk_4500/chr9.vcf.bgz and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chr9.vcf.bgz.tbi b/data/data_shrink_chunk_4500/chr9.vcf.bgz.tbi deleted file mode 100644 index d603532c2..000000000 Binary files a/data/data_shrink_chunk_4500/chr9.vcf.bgz.tbi and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chrX.vcf.bgz b/data/data_shrink_chunk_4500/chrX.vcf.bgz deleted file mode 100644 index 9e30288d0..000000000 Binary files a/data/data_shrink_chunk_4500/chrX.vcf.bgz and /dev/null differ diff --git a/data/data_shrink_chunk_4500/chrX.vcf.bgz.tbi b/data/data_shrink_chunk_4500/chrX.vcf.bgz.tbi deleted file mode 100644 index e2d538e03..000000000 Binary files a/data/data_shrink_chunk_4500/chrX.vcf.bgz.tbi and /dev/null differ diff --git a/data/data_shrink_combined_4500/chr1_to_22_and_X.vcf.bgz.tbi b/data/data_shrink_combined_4500/chr1_to_22_and_X.vcf.bgz.tbi deleted file mode 100644 index c520c72dc..000000000 Binary files a/data/data_shrink_combined_4500/chr1_to_22_and_X.vcf.bgz.tbi and /dev/null differ diff 
--git a/main.nf b/main.nf
new file mode 100644
index 000000000..4470703e7
--- /dev/null
+++ b/main.nf
@@ -0,0 +1,47 @@
+#!/usr/bin/env nextflow
+nextflow.enable.dsl = 2
+
+include { GENERATE_EXAMPLE_GENOTYPES_VCFS } from './modules/generate_example_genotypes_vcfs.nf'
+include { CHUNK_VCFS } from './modules/chunk_vcfs.nf'
+include { CONCAT_CHUNKED_VCFS } from './modules/concat_chunked_vcfs.nf'
+include { EXTRACT_SAMPLE_IDS } from './modules/extract_sample_ids.nf'
+include { GENERATE_PHENO_COV } from './modules/generate_pheno_cov.nf'
+include { INDEX_CHUNKED_VCFS } from './modules/index_chunked_vcfs.nf'
+
+// Rebuilds the test data set: downloads the source VCFs, keeps the first
+// variants of each chromosome, indexes and concatenates the chunks, and
+// simulates a phenotype and covariates for the samples in the VCF header.
+workflow {
+    // Run the download process
+    GENERATE_EXAMPLE_GENOTYPES_VCFS()
+
+    // Pair each VCF with its chromosome label, i.e. the second dot-separated
+    // field of the file name (ALL.<chr>.phase3_...)
+    def vcfs_with_chr = GENERATE_EXAMPLE_GENOTYPES_VCFS.out.vcfs
+        .flatten()
+        .map { vcf ->
+            def chr = vcf.name.toString().split("\\.")[1]
+            tuple(chr, vcf)
+        }
+
+    // Feed the tuples into the chunking process
+    CHUNK_VCFS(vcfs_with_chr)
+
+    // Chunking tasks finish in arbitrary order, so sort the chunks before
+    // concatenating (chr1..chr22 numerically, other labels such as chrX last)
+    // to make the combined VCF reproducible between runs.
+    def chr_label = { vcf -> vcf.name.replaceFirst('^chr', '').replaceFirst('_chunked.*', '') }
+    def chr_rank = { vcf -> chr_label(vcf).isInteger() ? chr_label(vcf).toInteger() : Integer.MAX_VALUE }
+    def all_chunked_vcfs = CHUNK_VCFS.out.chunked_vcfs
+        .toSortedList { a, b -> (chr_rank(a) <=> chr_rank(b)) ?: (a.name <=> b.name) }
+    CONCAT_CHUNKED_VCFS(all_chunked_vcfs)
+
+    // Take the sample IDs from the chr1 chunk produced by this run. The copy
+    // published under results/chunked_vcfs/ may be missing (fresh checkout) or
+    // stale (earlier run), and reading it does not wait for CHUNK_VCFS.
+    def chr1_ch = CHUNK_VCFS.out.chunked_vcfs
+        .filter { vcf -> vcf.name == 'chr1_chunked.vcf.gz' }
+    EXTRACT_SAMPLE_IDS(chr1_ch)
+    GENERATE_PHENO_COV(EXTRACT_SAMPLE_IDS.out.sample_ids)
+    INDEX_CHUNKED_VCFS(CHUNK_VCFS.out.chunked_vcfs)
+}
diff --git a/modules/chunk_vcfs.nf b/modules/chunk_vcfs.nf
new file mode 100644
index 000000000..2a76f79cb
--- /dev/null
+++ b/modules/chunk_vcfs.nf
@@ -0,0 +1,19 @@
+// Keeps the header lines plus the first 4500 variant records of one chromosome
+// VCF and writes them bgzip-compressed as <chr>_chunked.vcf.gz.
+process CHUNK_VCFS {
+    container "community.wave.seqera.io/library/bcftools_tabix_pip_tools:48085064a9189d8c"
+    publishDir params.outdir_chunked_vcfs, mode: 'copy'
+
+    input:
+    tuple val(chr), path(vcfs)
+
+    output:
+    path("${chr}_chunked.vcf.gz"), emit: chunked_vcfs
+
+    script:
+    // No tabix call here: the index is not a declared output of this process;
+    // INDEX_CHUNKED_VCFS builds and publishes it.
+    """
+    bcftools view ${vcfs} | awk 'BEGIN {h=1; n=4500} /^#/ {print; next} {if (h <= n) {print; h++}}' | bgzip >${chr}_chunked.vcf.gz
+    """
+}
diff --git a/modules/concat_chunked_vcfs.nf b/modules/concat_chunked_vcfs.nf
new file mode 100644
index 000000000..2d4fec4f0
--- /dev/null
+++ b/modules/concat_chunked_vcfs.nf
@@ -0,0 +1,22 @@
+// Concatenates the chunked VCFs, in the order they are received, into one
+// bgzip-compressed VCF (combined_chunked.vcf.gz) and builds its tabix index.
+process CONCAT_CHUNKED_VCFS {
+    container "community.wave.seqera.io/library/bcftools_tabix_pip_tools:48085064a9189d8c"
+    publishDir params.outdir_chunked_vcfs, mode: 'copy'
+
+    input:
+    path vcf_files
+
+    output:
+    path "combined_chunked.vcf.gz"
+    path "combined_chunked.vcf.gz.tbi"
+
+    script:
+    """
+    echo "VCFs to concat:" > concat_debug.txt
+    ls -lh ${vcf_files} >> concat_debug.txt
+
+    bcftools concat -Oz -o combined_chunked.vcf.gz ${vcf_files.join(' ')}
+    tabix -p vcf combined_chunked.vcf.gz
+    """
+}
diff --git a/modules/extract_sample_ids.nf b/modules/extract_sample_ids.nf
new file mode 100644
index 000000000..c66a60509
--- /dev/null
+++ b/modules/extract_sample_ids.nf
@@ -0,0 +1,21 @@
+// Writes the sample IDs of a compressed VCF (columns 10 onward of the #CHROM
+// header line) to sample_ids.txt, one ID per line.
+process EXTRACT_SAMPLE_IDS {
+    container "community.wave.seqera.io/library/r-base:4.4.3--1e564c44feffeaa0"
+    // Copy rather than symlink: a published symlink is an absolute path into
+    // work/ and dangles once work/ is cleaned or the repository is cloned.
+    publishDir params.outdir_pheno_cov, mode: 'copy'
+
+    input:
+    path vcf_file
+
+    output:
+    path "sample_ids.txt", emit: sample_ids
+
+    script:
+    // Backslashes are doubled so that tr receives the two-character escapes
+    // \t and \n rather than a literal tab and a literal line break.
+    """
+    zcat ${vcf_file} | grep '^#CHROM' | cut -f10- | tr '\\t' '\\n' > sample_ids.txt
+    """
+}
diff --git a/modules/generate_example_genotypes_vcfs.nf b/modules/generate_example_genotypes_vcfs.nf
new file mode 100644
index 000000000..b001ad86a
--- /dev/null
+++ b/modules/generate_example_genotypes_vcfs.nf
@@ -0,0 +1,21 @@
+// Downloads the 1000 Genomes phase 3 genotype VCFs for chromosomes 1-22 and X
+// from the EBI release directory into the task directory.
+process GENERATE_EXAMPLE_GENOTYPES_VCFS {
+    container "community.wave.seqera.io/library/bcftools_tabix_pip_tools:48085064a9189d8c"
+    publishDir params.outdir_vcfs, mode: 'symlink'
+
+    output:
+    path "*.vcf.gz", emit: vcfs
+
+    script:
+    // --fail makes curl exit non-zero on an HTTP error instead of saving the
+    // error page under a .vcf.gz name; --retry covers transient failures.
+    """
+    base_url="https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502"
+    for chr in {1..22}; do
+        fname="ALL.chr\${chr}.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz"
+        curl --fail --retry 3 -O "\${base_url}/\${fname}"
+    done
+    curl --fail --retry 3 -O "\${base_url}/ALL.chrX.phase3_shapeit2_mvncall_integrated_v1c.20130502.genotypes.vcf.gz"
+    """
+}
diff --git
a/modules/generate_pheno_cov.nf b/modules/generate_pheno_cov.nf
new file mode 100644
index 000000000..cafb93376
--- /dev/null
+++ b/modules/generate_pheno_cov.nf
@@ -0,0 +1,41 @@
+// Simulates one phenotype (standard normal) and two uniform covariates for the
+// sample IDs read from the input file, and writes them as tab-delimited,
+// header-less example.pheno and example.covar (ID repeated as family ID).
+process GENERATE_PHENO_COV {
+    container "community.wave.seqera.io/library/r-base:4.4.3--1e564c44feffeaa0"
+    publishDir params.outdir_pheno_cov, mode: 'copy'
+
+    input:
+    path sample_ids
+
+    output:
+    path "example.pheno"
+    path "example.covar"
+
+    script:
+    """
+    #!/usr/bin/env Rscript
+    #make a phenotype
+    #Here, a not too bad tutorial on different techniques on how to simulate data
+    # https://aosmith.rbind.io/2018/08/29/getting-started-simulating-data/
+
+    #Here I used the blog's proposed way of simulating data for a regression analysis
+    #We will use the generated data slightly different, but hopefully good enough to
+    # actually get some results
+    ids <- readLines("${sample_ids}")
+    n <- length(ids)
+    set.seed(16)
+    y = rnorm(n = n, mean = 0, sd = 1)
+    x1 = runif(n = n, min = 1, max = 2)
+    x2 = runif(n = n, min = 200, max = 300)
+
+    # Write this to one phenodata file and one covardata file
+    # first column, unique ids, second column family ids, remaining columns are
+    # phenotyp or covariate columns (here individual IDs are family IDs)
+    example.pheno <- data.frame(ids=ids, fam=ids, pheno=y)
+    example.covar <- data.frame(ids=ids, fam=ids, cov1=x1, cov2=x2)
+    # Write to tab-delimited files without headers or row names
+    write.table(example.pheno, file = "example.pheno", sep = "\\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
+    write.table(example.covar, file = "example.covar", sep = "\\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
+    """
+}
diff --git a/modules/index_chunked_vcfs.nf b/modules/index_chunked_vcfs.nf
new file mode 100644
index 000000000..a52f2cab4
--- /dev/null
+++ b/modules/index_chunked_vcfs.nf
@@ -0,0 +1,17 @@
+// Builds a tabix index (<name>.vcf.gz.tbi) for the VCF file(s) passed in and
+// publishes it to params.outdir_chunked_vcfs.
+process INDEX_CHUNKED_VCFS {
+    container "community.wave.seqera.io/library/bcftools_tabix_pip_tools:48085064a9189d8c"
+    publishDir params.outdir_chunked_vcfs, mode: 'copy'
+
+    input:
+    path vcf_files
+
+    output:
+    path "*.vcf.gz.tbi", emit: indexed_vcfs
+
+    script:
+    """
+    tabix -p vcf ${vcf_files}
+    """
+}
diff --git a/nextflow.config b/nextflow.config
new file mode 100644
index 000000000..e211bc615
--- /dev/null
+++ b/nextflow.config
@@ -0,0 +1,13 @@
+// Output directories; the three sub-directories are derived from outdir_base.
+params {
+    outdir_base = "results"
+    outdir_vcfs = "${params.outdir_base}/vcfs"
+    outdir_chunked_vcfs = "${params.outdir_base}/chunked_vcfs"
+    outdir_pheno_cov = "${params.outdir_base}/pheno_cov"
+}
+
+// Enable Singularity as the container runtime, with automatic mounts.
+singularity {
+    enabled = true
+    autoMounts = true
+}
diff --git a/results/chunked_vcfs/chr10_chunked.vcf.gz b/results/chunked_vcfs/chr10_chunked.vcf.gz
new file mode 100644
index 000000000..859f6a1c9
Binary files /dev/null and b/results/chunked_vcfs/chr10_chunked.vcf.gz differ
diff --git a/results/chunked_vcfs/chr10_chunked.vcf.gz.tbi b/results/chunked_vcfs/chr10_chunked.vcf.gz.tbi
new file mode 100644
index 000000000..d6dfc97b5
Binary files /dev/null and b/results/chunked_vcfs/chr10_chunked.vcf.gz.tbi differ
diff --git a/results/chunked_vcfs/chr11_chunked.vcf.gz b/results/chunked_vcfs/chr11_chunked.vcf.gz
new file mode 100644
index 000000000..61b5ee559
Binary files /dev/null and b/results/chunked_vcfs/chr11_chunked.vcf.gz differ
diff --git a/results/chunked_vcfs/chr11_chunked.vcf.gz.tbi b/results/chunked_vcfs/chr11_chunked.vcf.gz.tbi
new file mode 100644
index 000000000..2f3e9049e
Binary files /dev/null and b/results/chunked_vcfs/chr11_chunked.vcf.gz.tbi differ
diff --git a/results/chunked_vcfs/chr12_chunked.vcf.gz b/results/chunked_vcfs/chr12_chunked.vcf.gz
new file mode 100644
index 000000000..84c1e7562
Binary files /dev/null and b/results/chunked_vcfs/chr12_chunked.vcf.gz differ
diff --git a/results/chunked_vcfs/chr12_chunked.vcf.gz.tbi b/results/chunked_vcfs/chr12_chunked.vcf.gz.tbi
new file mode 100644
index 000000000..037541291
Binary files /dev/null and b/results/chunked_vcfs/chr12_chunked.vcf.gz.tbi differ
diff --git
a/results/chunked_vcfs/chr13_chunked.vcf.gz b/results/chunked_vcfs/chr13_chunked.vcf.gz new file mode 100644 index 000000000..15cb38bfa Binary files /dev/null and b/results/chunked_vcfs/chr13_chunked.vcf.gz differ diff --git a/results/chunked_vcfs/chr13_chunked.vcf.gz.tbi b/results/chunked_vcfs/chr13_chunked.vcf.gz.tbi new file mode 100644 index 000000000..1da8dd315 Binary files /dev/null and b/results/chunked_vcfs/chr13_chunked.vcf.gz.tbi differ diff --git a/results/chunked_vcfs/chr14_chunked.vcf.gz b/results/chunked_vcfs/chr14_chunked.vcf.gz new file mode 100644 index 000000000..ba47a723e Binary files /dev/null and b/results/chunked_vcfs/chr14_chunked.vcf.gz differ diff --git a/results/chunked_vcfs/chr14_chunked.vcf.gz.tbi b/results/chunked_vcfs/chr14_chunked.vcf.gz.tbi new file mode 100644 index 000000000..5a7a4db4f Binary files /dev/null and b/results/chunked_vcfs/chr14_chunked.vcf.gz.tbi differ diff --git a/results/chunked_vcfs/chr15_chunked.vcf.gz b/results/chunked_vcfs/chr15_chunked.vcf.gz new file mode 100644 index 000000000..f432bac9e Binary files /dev/null and b/results/chunked_vcfs/chr15_chunked.vcf.gz differ diff --git a/results/chunked_vcfs/chr15_chunked.vcf.gz.tbi b/results/chunked_vcfs/chr15_chunked.vcf.gz.tbi new file mode 100644 index 000000000..b4795ed80 Binary files /dev/null and b/results/chunked_vcfs/chr15_chunked.vcf.gz.tbi differ diff --git a/results/chunked_vcfs/chr16_chunked.vcf.gz b/results/chunked_vcfs/chr16_chunked.vcf.gz new file mode 100644 index 000000000..cf512f63f Binary files /dev/null and b/results/chunked_vcfs/chr16_chunked.vcf.gz differ diff --git a/results/chunked_vcfs/chr16_chunked.vcf.gz.tbi b/results/chunked_vcfs/chr16_chunked.vcf.gz.tbi new file mode 100644 index 000000000..d7d01b5be Binary files /dev/null and b/results/chunked_vcfs/chr16_chunked.vcf.gz.tbi differ diff --git a/results/chunked_vcfs/chr17_chunked.vcf.gz b/results/chunked_vcfs/chr17_chunked.vcf.gz new file mode 100644 index 000000000..3d376f4ec Binary files 
/dev/null and b/results/chunked_vcfs/chr17_chunked.vcf.gz differ diff --git a/results/chunked_vcfs/chr17_chunked.vcf.gz.tbi b/results/chunked_vcfs/chr17_chunked.vcf.gz.tbi new file mode 100644 index 000000000..1b2ccbb7e Binary files /dev/null and b/results/chunked_vcfs/chr17_chunked.vcf.gz.tbi differ diff --git a/results/chunked_vcfs/chr18_chunked.vcf.gz b/results/chunked_vcfs/chr18_chunked.vcf.gz new file mode 100644 index 000000000..92d0ca79f Binary files /dev/null and b/results/chunked_vcfs/chr18_chunked.vcf.gz differ diff --git a/results/chunked_vcfs/chr18_chunked.vcf.gz.tbi b/results/chunked_vcfs/chr18_chunked.vcf.gz.tbi new file mode 100644 index 000000000..db02f4f44 Binary files /dev/null and b/results/chunked_vcfs/chr18_chunked.vcf.gz.tbi differ diff --git a/results/chunked_vcfs/chr19_chunked.vcf.gz b/results/chunked_vcfs/chr19_chunked.vcf.gz new file mode 100644 index 000000000..486c38852 Binary files /dev/null and b/results/chunked_vcfs/chr19_chunked.vcf.gz differ diff --git a/results/chunked_vcfs/chr19_chunked.vcf.gz.tbi b/results/chunked_vcfs/chr19_chunked.vcf.gz.tbi new file mode 100644 index 000000000..de91de1b1 Binary files /dev/null and b/results/chunked_vcfs/chr19_chunked.vcf.gz.tbi differ diff --git a/results/chunked_vcfs/chr1_chunked.vcf.gz b/results/chunked_vcfs/chr1_chunked.vcf.gz new file mode 100644 index 000000000..2fffa8501 Binary files /dev/null and b/results/chunked_vcfs/chr1_chunked.vcf.gz differ diff --git a/results/chunked_vcfs/chr1_chunked.vcf.gz.tbi b/results/chunked_vcfs/chr1_chunked.vcf.gz.tbi new file mode 100644 index 000000000..cded4f1ef Binary files /dev/null and b/results/chunked_vcfs/chr1_chunked.vcf.gz.tbi differ diff --git a/results/chunked_vcfs/chr20_chunked.vcf.gz b/results/chunked_vcfs/chr20_chunked.vcf.gz new file mode 100644 index 000000000..b8ef5c426 Binary files /dev/null and b/results/chunked_vcfs/chr20_chunked.vcf.gz differ diff --git a/results/chunked_vcfs/chr20_chunked.vcf.gz.tbi 
b/results/chunked_vcfs/chr20_chunked.vcf.gz.tbi new file mode 100644 index 000000000..3d41a667d Binary files /dev/null and b/results/chunked_vcfs/chr20_chunked.vcf.gz.tbi differ diff --git a/results/chunked_vcfs/chr21_chunked.vcf.gz b/results/chunked_vcfs/chr21_chunked.vcf.gz new file mode 100644 index 000000000..c04700e49 Binary files /dev/null and b/results/chunked_vcfs/chr21_chunked.vcf.gz differ diff --git a/results/chunked_vcfs/chr21_chunked.vcf.gz.tbi b/results/chunked_vcfs/chr21_chunked.vcf.gz.tbi new file mode 100644 index 000000000..b57487f38 Binary files /dev/null and b/results/chunked_vcfs/chr21_chunked.vcf.gz.tbi differ diff --git a/results/chunked_vcfs/chr22_chunked.vcf.gz b/results/chunked_vcfs/chr22_chunked.vcf.gz new file mode 100644 index 000000000..1a4910bb8 Binary files /dev/null and b/results/chunked_vcfs/chr22_chunked.vcf.gz differ diff --git a/results/chunked_vcfs/chr22_chunked.vcf.gz.tbi b/results/chunked_vcfs/chr22_chunked.vcf.gz.tbi new file mode 100644 index 000000000..e100cc221 Binary files /dev/null and b/results/chunked_vcfs/chr22_chunked.vcf.gz.tbi differ diff --git a/results/chunked_vcfs/chr2_chunked.vcf.gz b/results/chunked_vcfs/chr2_chunked.vcf.gz new file mode 100644 index 000000000..2173c33ae Binary files /dev/null and b/results/chunked_vcfs/chr2_chunked.vcf.gz differ diff --git a/results/chunked_vcfs/chr2_chunked.vcf.gz.tbi b/results/chunked_vcfs/chr2_chunked.vcf.gz.tbi new file mode 100644 index 000000000..a17390b0c Binary files /dev/null and b/results/chunked_vcfs/chr2_chunked.vcf.gz.tbi differ diff --git a/results/chunked_vcfs/chr3_chunked.vcf.gz b/results/chunked_vcfs/chr3_chunked.vcf.gz new file mode 100644 index 000000000..d056fcc92 Binary files /dev/null and b/results/chunked_vcfs/chr3_chunked.vcf.gz differ diff --git a/results/chunked_vcfs/chr3_chunked.vcf.gz.tbi b/results/chunked_vcfs/chr3_chunked.vcf.gz.tbi new file mode 100644 index 000000000..fdd6362f3 Binary files /dev/null and 
b/results/chunked_vcfs/chr3_chunked.vcf.gz.tbi differ diff --git a/results/chunked_vcfs/chr4_chunked.vcf.gz b/results/chunked_vcfs/chr4_chunked.vcf.gz new file mode 100644 index 000000000..543e82251 Binary files /dev/null and b/results/chunked_vcfs/chr4_chunked.vcf.gz differ diff --git a/results/chunked_vcfs/chr4_chunked.vcf.gz.tbi b/results/chunked_vcfs/chr4_chunked.vcf.gz.tbi new file mode 100644 index 000000000..25d3fdca9 Binary files /dev/null and b/results/chunked_vcfs/chr4_chunked.vcf.gz.tbi differ diff --git a/results/chunked_vcfs/chr5_chunked.vcf.gz b/results/chunked_vcfs/chr5_chunked.vcf.gz new file mode 100644 index 000000000..4a0d18c87 Binary files /dev/null and b/results/chunked_vcfs/chr5_chunked.vcf.gz differ diff --git a/results/chunked_vcfs/chr5_chunked.vcf.gz.tbi b/results/chunked_vcfs/chr5_chunked.vcf.gz.tbi new file mode 100644 index 000000000..95bf078e6 Binary files /dev/null and b/results/chunked_vcfs/chr5_chunked.vcf.gz.tbi differ diff --git a/results/chunked_vcfs/chr6_chunked.vcf.gz b/results/chunked_vcfs/chr6_chunked.vcf.gz new file mode 100644 index 000000000..a486ec8e2 Binary files /dev/null and b/results/chunked_vcfs/chr6_chunked.vcf.gz differ diff --git a/results/chunked_vcfs/chr6_chunked.vcf.gz.tbi b/results/chunked_vcfs/chr6_chunked.vcf.gz.tbi new file mode 100644 index 000000000..4b0384b0b Binary files /dev/null and b/results/chunked_vcfs/chr6_chunked.vcf.gz.tbi differ diff --git a/results/chunked_vcfs/chr7_chunked.vcf.gz b/results/chunked_vcfs/chr7_chunked.vcf.gz new file mode 100644 index 000000000..274c8ad20 Binary files /dev/null and b/results/chunked_vcfs/chr7_chunked.vcf.gz differ diff --git a/results/chunked_vcfs/chr7_chunked.vcf.gz.tbi b/results/chunked_vcfs/chr7_chunked.vcf.gz.tbi new file mode 100644 index 000000000..e042fedd4 Binary files /dev/null and b/results/chunked_vcfs/chr7_chunked.vcf.gz.tbi differ diff --git a/results/chunked_vcfs/chr8_chunked.vcf.gz b/results/chunked_vcfs/chr8_chunked.vcf.gz new file mode 100644 
index 000000000..8d898a8e2 Binary files /dev/null and b/results/chunked_vcfs/chr8_chunked.vcf.gz differ diff --git a/results/chunked_vcfs/chr8_chunked.vcf.gz.tbi b/results/chunked_vcfs/chr8_chunked.vcf.gz.tbi new file mode 100644 index 000000000..297ff69c0 Binary files /dev/null and b/results/chunked_vcfs/chr8_chunked.vcf.gz.tbi differ diff --git a/results/chunked_vcfs/chr9_chunked.vcf.gz b/results/chunked_vcfs/chr9_chunked.vcf.gz new file mode 100644 index 000000000..2248c1601 Binary files /dev/null and b/results/chunked_vcfs/chr9_chunked.vcf.gz differ diff --git a/results/chunked_vcfs/chr9_chunked.vcf.gz.tbi b/results/chunked_vcfs/chr9_chunked.vcf.gz.tbi new file mode 100644 index 000000000..2e1256ab7 Binary files /dev/null and b/results/chunked_vcfs/chr9_chunked.vcf.gz.tbi differ diff --git a/results/chunked_vcfs/chrX_chunked.vcf.gz b/results/chunked_vcfs/chrX_chunked.vcf.gz new file mode 100644 index 000000000..e6eba0412 Binary files /dev/null and b/results/chunked_vcfs/chrX_chunked.vcf.gz differ diff --git a/results/chunked_vcfs/chrX_chunked.vcf.gz.tbi b/results/chunked_vcfs/chrX_chunked.vcf.gz.tbi new file mode 100644 index 000000000..8884c97d5 Binary files /dev/null and b/results/chunked_vcfs/chrX_chunked.vcf.gz.tbi differ diff --git a/data/data_shrink_combined_4500/chr1_to_22_and_X.vcf.bgz b/results/chunked_vcfs/combined_chunked.vcf.gz similarity index 64% rename from data/data_shrink_combined_4500/chr1_to_22_and_X.vcf.bgz rename to results/chunked_vcfs/combined_chunked.vcf.gz index 069fc1cd8..8e9a98fe1 100644 Binary files a/data/data_shrink_combined_4500/chr1_to_22_and_X.vcf.bgz and b/results/chunked_vcfs/combined_chunked.vcf.gz differ diff --git a/results/chunked_vcfs/combined_chunked.vcf.gz.tbi b/results/chunked_vcfs/combined_chunked.vcf.gz.tbi new file mode 100644 index 000000000..bd261a461 Binary files /dev/null and b/results/chunked_vcfs/combined_chunked.vcf.gz.tbi differ diff --git a/data/data_phenotypes_and_covariates/example1.covar 
b/results/pheno_cov/example.covar similarity index 100% rename from data/data_phenotypes_and_covariates/example1.covar rename to results/pheno_cov/example.covar diff --git a/data/data_phenotypes_and_covariates/example1.pheno b/results/pheno_cov/example.pheno similarity index 100% rename from data/data_phenotypes_and_covariates/example1.pheno rename to results/pheno_cov/example.pheno diff --git a/results/pheno_cov/sample_ids.txt b/results/pheno_cov/sample_ids.txt new file mode 120000 index 000000000..f81e16c29 --- /dev/null +++ b/results/pheno_cov/sample_ids.txt @@ -0,0 +1 @@ +/projects/academic/rpili/test-datasets/work/2e/954742e111e39708e7d33295f45594/sample_ids.txt \ No newline at end of file diff --git a/scripts/generate-example-data-pheno-and-covar.R b/scripts/generate-example-data-pheno-and-covar.R deleted file mode 100644 index 3e0368a29..000000000 --- a/scripts/generate-example-data-pheno-and-covar.R +++ /dev/null @@ -1,42 +0,0 @@ -#Using same IDs as present in the genotype file we make an artificial phenotyp and covariate file - -#path to genotype vcf file -pathToVcf <- "data/data_shrink_chunk_4500/chr22.vcf.bgz" - -#Get sample name -header <- system(paste("zcat ", pathToVcf, " | grep '#CHROM'", sep=""), intern=TRUE) -header <- unlist(strsplit(header, split="\t")) -#sampleIDs -ids <- header[10:length(header)] - -#make a phenotype -#Here, a not too bad tutorial on different techniques on how to simulate data -# https://aosmith.rbind.io/2018/08/29/getting-started-simulating-data/ - -#Here I used the blog's proposed way of simulating data for a regression analysis -#We will use the generated data slightly different, but hopefully good enough to -# actually get som results -n <- length(ids) -set.seed(16) -y = rnorm(n = n, mean = 0, sd = 1) -x1 = runif(n = n, min = 1, max = 2) -x2 = runif(n = n, min = 200, max = 300) - -# Write this to one phenodata file and one covardata file -# first column, unique ids, second column family ids, remaining columns are -# 
phenotyp or covariate columns (here individual IDs are family IDs) -phe <- data.frame(ids=ids, fam=ids, pheno=y) -cov <- data.frame(ids=ids, fam=ids, cov1=x1, cov2=x2) - -#write to file -outDir <- "data/data_phenotypes_and_covariates" -write.table(phe, file = paste(outDir,"/example1.pheno", sep=""), append = FALSE, quote = FALSE, sep = "\t", - eol = "\n", na = "NA", dec = ".", row.names = FALSE, - col.names = FALSE, qmethod = c("escape", "double"), - fileEncoding = "") -write.table(cov, file = paste(outDir,"/example1.covar", sep=""), append = FALSE, quote = FALSE, sep = "\t", - eol = "\n", na = "NA", dec = ".", row.names = FALSE, - col.names = FALSE, qmethod = c("escape", "double"), - fileEncoding = "") - - diff --git a/scripts/generate-example-genotype-vcfs.sh b/scripts/generate-example-genotype-vcfs.sh deleted file mode 100644 index e392bb940..000000000 --- a/scripts/generate-example-genotype-vcfs.sh +++ /dev/null @@ -1,36 +0,0 @@ -# Use 1000 genomes to use as as shrinked set, in chunks and in a combined form. -# A data reseource, meant to be used for development and testing. 
- -# Download 22+X chromosomes -source_fold="data/data_source" -mkdir -p ${source_fold} -for chr in {1..22}; do - echo ${chr} - wget -P ${source_fold} ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr${chr}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz* -done -wget -P ${source_fold} ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chrX.phase3_shapeit2_mvncall_integrated_v1b.20130502.genotypes.vcf.gz* - -# Load modules (how software is installed on HPC) -module load tools -module load bcftools/1.9 -module load tabix/1.2.1 - -# Shrink files to only contain 4500 lines each (to not take too much space) -chunk_fold="data/data_shrink_chunk_4500" -mkdir -p ${chunk_fold} -for chr in {1..22}; do - fileToRead="data_source/ALL.chr${chr}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz" - bcftools view -Ov ${fileToRead} | head -n4500 | bgzip -c > ${chunk_fold}/chr${chr}.vcf.bgz - tabix -p vcf ${chunk_fold}/chr${chr}.vcf.bgz -done -fileToRead="data_source/ALL.chrX.phase3_shapeit2_mvncall_integrated_v1b.20130502.genotypes.vcf.gz" -bcftools view -Ov ${fileToRead} | head -n4500 | bgzip -c > ${chunk_fold}/chrX.vcf.bgz -tabix -p vcf ${chunk_fold}/chrX.vcf.bgz - -# Use bcftools to combine the shrinked data -combine_fold="data/data_shrink_combined_4500" -mkdir -p ${combine_fold} -bcftools concat -Ov ${chunk_fold}/*.vcf.bgz | bgzip -c > ${combine_fold}/chr1_to_22_and_X.vcf.bgz -tabix -p vcf ${combine_fold}/chr1_to_22_and_X.vcf.bgz - -