Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 0 additions & 72 deletions validation/validator/allowed_data_types.txt

This file was deleted.

Empty file modified validation/validator/cbioportalImporter.py
100755 → 100644
Empty file.
135 changes: 87 additions & 48 deletions validation/validator/cbioportal_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
from subprocess import Popen, PIPE, STDOUT
from typing import Dict, Optional
import dsnparse
import MySQLdb
import pymysql
pymysql.install_as_MySQLdb()


# ------------------------------------------------------------------------------
# globals
Expand Down Expand Up @@ -737,59 +739,96 @@ def get_meta_file_type(meta_dictionary, logger, filename):


# Allowed (genetic_alteration_type, datatype, stable_id) combinations.
# A stable_id of "*" is a wildcard: any stable_id is accepted for that
# genetic_alteration_type / datatype pair.
# Built once at import time; the contents never change at runtime.
_ALLOWED_TYPE_COMBINATIONS = frozenset({
    ("COPY_NUMBER_ALTERATION", "DISCRETE", "cna"),
    ("COPY_NUMBER_ALTERATION", "DISCRETE_LONG", "cna"),
    ("COPY_NUMBER_ALTERATION", "DISCRETE", "cna_rae"),
    ("COPY_NUMBER_ALTERATION", "DISCRETE", "cna_consensus"),
    ("COPY_NUMBER_ALTERATION", "DISCRETE", "gistic"),
    ("COPY_NUMBER_ALTERATION", "DISCRETE_LONG", "gistic"),
    ("COPY_NUMBER_ALTERATION", "CONTINUOUS", "linear_CNA"),
    ("COPY_NUMBER_ALTERATION", "LOG2-VALUE", "log2CNA"),
    ("MUTATION_EXTENDED", "MAF", "mutations"),
    ("MUTATION_UNCALLED", "MAF", "mutations_uncalled"),
    ("MRNA_EXPRESSION", "CONTINUOUS", "mrna_U133"),
    ("MRNA_EXPRESSION", "Z-SCORE", "mrna_U133_Zscores"),
    ("MRNA_EXPRESSION", "Z-SCORE", "mrna_U133_all_sample_Zscores"),
    ("MRNA_EXPRESSION", "Z-SCORE", "rna_seq_mrna_median_Zscores"),
    ("MRNA_EXPRESSION", "Z-SCORE", "rna_seq_mrna_median_all_sample_Zscores"),
    ("MRNA_EXPRESSION", "Z-SCORE", "mrna_median_Zscores"),
    ("MRNA_EXPRESSION", "CONTINUOUS", "rna_seq_mrna"),
    ("MRNA_EXPRESSION", "CONTINUOUS", "rna_seq_v2_mrna"),
    ("MRNA_EXPRESSION", "Z-SCORE", "rna_seq_v2_mrna_median_Zscores"),
    ("MRNA_EXPRESSION", "Z-SCORE", "rna_seq_v2_mrna_median_all_sample_Zscores"),
    ("MRNA_EXPRESSION", "CONTINUOUS", "rna_seq_v2_mrna_median_normals"),
    ("MRNA_EXPRESSION", "Z-SCORE", "rna_seq_v2_mrna_median_normals_Zscores"),
    ("MRNA_EXPRESSION", "Z-SCORE", "rna_seq_v2_mrna_median_all_sample_ref_normal_Zscores"),
    ("MRNA_EXPRESSION", "CONTINUOUS", "mirna"),
    ("MRNA_EXPRESSION", "Z-SCORE", "mirna_median_Zscores"),
    ("MRNA_EXPRESSION", "Z-SCORE", "mrna_merged_median_Zscores"),
    ("MRNA_EXPRESSION", "CONTINUOUS", "mrna"),
    ("MRNA_EXPRESSION", "Z-SCORE", "mrna_median_all_sample_Zscores"),
    ("MRNA_EXPRESSION", "Z-SCORE", "mrna_seq_fpkm_Zscores"),
    ("MRNA_EXPRESSION", "Z-SCORE", "mrna_seq_fpkm_all_sample_Zscores"),
    ("MRNA_EXPRESSION", "DISCRETE", "mrna_outliers"),
    ("MRNA_EXPRESSION", "CONTINUOUS", "mrna_seq_fpkm_capture"),
    ("MRNA_EXPRESSION", "Z-SCORE", "mrna_seq_fpkm_capture_Zscores"),
    ("MRNA_EXPRESSION", "Z-SCORE", "mrna_seq_fpkm_capture_all_sample_Zscores"),
    ("MRNA_EXPRESSION", "CONTINUOUS", "rna_seq_mrna_capture"),
    ("MRNA_EXPRESSION", "Z-SCORE", "rna_seq_mrna_capture_Zscores"),
    ("MRNA_EXPRESSION", "Z-SCORE", "rna_seq_mrna_capture_all_sample_Zscores"),
    ("MRNA_EXPRESSION", "Z-SCORE", "mrna_seq_fpkm_polya_all_sample_Zscores"),
    ("MRNA_EXPRESSION", "Z-SCORE", "mrna_seq_fpkm_polya_Zscores"),
    ("MRNA_EXPRESSION", "CONTINUOUS", "mrna_seq_fpkm_polya"),
    ("MRNA_EXPRESSION", "CONTINUOUS", "mrna_seq_cpm"),
    ("MRNA_EXPRESSION", "Z-SCORE", "mrna_seq_cpm_all_sample_Zscores"),
    ("MRNA_EXPRESSION", "Z-SCORE", "mrna_seq_cpm_Zscores"),
    ("MRNA_EXPRESSION", "CONTINUOUS", "mrna_seq_tpm"),
    ("MRNA_EXPRESSION", "Z-SCORE", "mrna_seq_tpm_all_sample_Zscores"),
    ("MRNA_EXPRESSION", "Z-SCORE", "mrna_seq_tpm_Zscores"),
    ("METHYLATION", "CONTINUOUS", "methylation_hm27"),
    ("METHYLATION", "CONTINUOUS", "methylation_hm450"),
    ("METHYLATION", "CONTINUOUS", "methylation_epic"),
    ("PROTEIN_LEVEL", "LOG2-VALUE", "rppa"),
    ("PROTEIN_LEVEL", "Z-SCORE", "rppa_Zscores"),
    ("PROTEIN_LEVEL", "CONTINUOUS", "protein_quantification"),
    ("PROTEIN_LEVEL", "Z-SCORE", "protein_quantification_zscores"),
    ("PROTEIN_LEVEL", "LOG2-VALUE", "protein_quantification"),
    ("GENESET_SCORE", "GSVA-SCORE", "gsva_scores"),
    ("GENESET_SCORE", "P-VALUE", "gsva_pvalues"),
    ("GENERIC_ASSAY", "LIMIT-VALUE", "*"),
    ("GENERIC_ASSAY", "BINARY", "*"),
    ("GENERIC_ASSAY", "CATEGORICAL", "*"),
    ("STRUCTURAL_VARIANT", "SV", "structural_variants"),
})


def validate_types_and_id(meta_dictionary, logger, filename):
    """Validate genetic_alteration_type, datatype and stable_id as a combination.

    The check only applies when all three fields are present in
    `meta_dictionary`; otherwise the meta file is accepted as-is. A
    combination is valid when it is listed in the built-in table, or when
    the table holds a wildcard ("*") stable_id for the same
    genetic_alteration_type / datatype pair.

    :param meta_dictionary: mapping of meta file field names to their values
    :param logger: logger that receives an error for an invalid combination
    :param filename: meta file name, attached to the log record as `filename_`
    :return: False if an invalid combination was logged, True otherwise
    """
    required_fields = ("genetic_alteration_type", "datatype", "stable_id")
    if not all(field in meta_dictionary for field in required_fields):
        # nothing to validate for meta files lacking any of the three fields
        return True

    genetic_alteration_type = meta_dictionary["genetic_alteration_type"]
    data_type = meta_dictionary["datatype"]
    stable_id = meta_dictionary["stable_id"]

    exact_match = (genetic_alteration_type, data_type, stable_id)
    wildcard_match = (genetic_alteration_type, data_type, "*")
    if (exact_match in _ALLOWED_TYPE_COMBINATIONS
            or wildcard_match in _ALLOWED_TYPE_COMBINATIONS):
        return True

    logger.error(
        "Invalid genetic_alteration_type, datatype, and stable_id combination: (%s, %s, %s). "
        "Please check your meta files.",
        genetic_alteration_type, data_type, stable_id,
        extra={'filename_': filename}
    )
    return False


def parse_metadata_file(filename,
logger,
study_id=None,
Expand Down
Empty file modified validation/validator/metaImport.py
100755 → 100644
Empty file.
59 changes: 39 additions & 20 deletions validation/validator/validateData.py
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@
MAX_SAMPLE_STABLE_ID_LENGTH = 63

# global variable that defines the invalid ID characters
# "[" and "]" are escaped and "-" is placed last so each is matched literally;
# unescaped, "+-[" forms a character range and the first "]" ends the class.
INVALID_ID_CHARACTERS = r"[^A-Za-z0-9._\(\)',+\[:\];-]"

# ----------------------------------------------------------------------------

Expand Down Expand Up @@ -1682,6 +1682,7 @@ class MutationsExtendedValidator(CustomDriverAnnotationValidator, CustomNamespac
def __init__(self, *args, **kwargs):
    """Initialize the validator, forwarding all arguments to the parent classes."""
    super(MutationsExtendedValidator, self).__init__(*args, **kwargs)
    # mutation keys encountered so far, used for duplicate detection
    self.seen_mutations = set()
    self.extraCols = []

def checkHeader(self, cols):
"""Validate header, requiring at least one gene id column."""
Expand Down Expand Up @@ -1745,6 +1746,9 @@ def checkLine(self, data):
self.checkAlleleMAFFormat(data)
self.checkAlleleSpecialCases(data)
self.checkValidationColumns(data)

# Validate duplicate mutations
self.checkDuplicateMutation(data)

for col_index, col_name in enumerate(self.cols):
# validate the column if there's a function defined for it
Expand All @@ -1763,6 +1767,24 @@ def checkLine(self, data):
raise RuntimeError(('Checking function %s set an error '
'message but reported no error') %
checking_function.__name__)
def checkDuplicateMutation(self, data):
    """Log an error when this row repeats a mutation seen earlier in the file.

    A mutation is identified by the stripped values of a fixed set of key
    columns. The check is skipped entirely when any of those columns is
    absent from the header in `self.cols`.

    :param data: cell values of the current line, ordered like `self.cols`
    """
    key_columns = (
        "Entrez_Gene_Id", "Chromosome", "Start_Position", "End_Position",
        "Variant_Classification", "Tumor_Seq_Allele2", "HGVSp_Short",
        "Tumor_Sample_Barcode",
    )
    try:
        # a single header scan per key column; ValueError means one is missing
        key_indices = [self.cols.index(col) for col in key_columns]
    except ValueError:
        return

    mutation_key = tuple(data[index].strip() for index in key_indices)
    if mutation_key in self.seen_mutations:
        log_message = f"Duplicate mutation found: {mutation_key}"
        self.logger.error(log_message, extra={'line_number': self.line_number})
    else:
        self.seen_mutations.add(mutation_key)


# validate Tumor_Sample_Barcode value to make sure it exists in clinical sample list:
sample_id_column_index = self.cols.index('Tumor_Sample_Barcode')
Expand Down Expand Up @@ -1972,35 +1994,32 @@ def checkAlleleMAFFormat(self, data):
return True

def checkAlleleSpecialCases(self, data):
    """Check indel rows for NA/missing values and repeated '-' in allele columns.

    The checks run only when the Reference_Allele, Tumor_Seq_Allele1,
    Tumor_Seq_Allele2 and Variant_Type columns are all present, and only
    for rows whose Variant_Type is INS or DEL. Findings are reported
    through `self.send_log_message` together with `self.strict_maf_checks`.

    :param data: cell values of the current line, ordered like `self.cols`
    :return: always True
    """
    necessary_columns = ('Reference_Allele', 'Tumor_Seq_Allele1',
                         'Tumor_Seq_Allele2', 'Variant_Type')
    if not set(necessary_columns).issubset(self.cols):
        return True

    ref_allele, tumor_seq_allele1, tumor_seq_allele2, variant_type = (
        data[self.cols.index(col)].strip() for col in necessary_columns
    )
    if variant_type not in ("INS", "DEL"):
        return True

    alleles = (ref_allele, tumor_seq_allele1, tumor_seq_allele2)
    cause = f'({ref_allele}, {tumor_seq_allele1}, {tumor_seq_allele2})'

    # Reject NA or empty values for indels
    if any(allele in ("NA", "") for allele in alleles):
        log_message = "Indel (INS/DEL) mutation has NA or missing values in allele columns."
        extra_dict = {'line_number': self.line_number, 'cause': cause}
        self.send_log_message(self.strict_maf_checks, log_message, extra_dict)

    # Reject more than one '-' in a single allele value; reported once per row
    if any(allele.count('-') > 1 for allele in alleles):
        log_message = "Indel (INS/DEL) mutation contains multiple '-' in allele columns."
        extra_dict = {'line_number': self.line_number, 'cause': cause}
        self.send_log_message(self.strict_maf_checks, log_message, extra_dict)

    return True

Expand Down
Empty file modified validation/validator/validateStudies.py
100755 → 100644
Empty file.