cBioPortal · sbabyanusha · Mar 4, 2025 · Mar 19, 2025
diff --git a/validation/validator/allowed_data_types.txt b/validation/validator/allowed_data_types.txt
diff --git a/validation/validator/cbioportalImporter.py b/validation/validator/cbioportalImporter.py
diff --git a/validation/validator/cbioportal_common.py b/validation/validator/cbioportal_common.py
@@ -13,7 +13,9 @@
 from subprocess import Popen, PIPE, STDOUT
 from typing import Dict, Optional
 import dsnparse
-import MySQLdb
+import pymysql
+pymysql.install_as_MySQLdb()
+
 
 # ------------------------------------------------------------------------------
 # globals
@@ -737,59 +739,96 @@ def get_meta_file_type(meta_dictionary, logger, filename):
 
 
 def validate_types_and_id(meta_dictionary, logger, filename):
-    """Validate a genetic_alteration_type, datatype (and stable_id in some cases) against the predefined
-    allowed combinations found in ./allowed_data_types.txt
+    """Check a meta file's genetic_alteration_type, datatype and stable_id against the
+    allowed combinations hard-coded below; log an error and return False on a mismatch.
     """
     result = True
-    # this validation only applies to items that have genetic_alteration_type and datatype and stable_id
-    if 'genetic_alteration_type' in meta_dictionary and 'datatype' in meta_dictionary and 'stable_id' in meta_dictionary:
-        alt_type_datatype_and_stable_id = {}
-        script_dir = os.path.dirname(__file__)
-        allowed_data_types_file_name = os.path.join(script_dir, "allowed_data_types.txt")
-        data_line_nr = 0
-        # build up map alt_type_datatype_and_stable_id:
-        with open(allowed_data_types_file_name) as allowed_data_types_file:
-            for line in allowed_data_types_file:
-                if line.startswith("#"):
-                    continue
-                data_line_nr += 1
-                # skip header, so if line is not header then process as tab separated:
-                if (data_line_nr > 1):
-                    line_cols = next(csv.reader([line], delimiter='\t'))
-                    genetic_alteration_type = line_cols[0]
-                    data_type = line_cols[1]
-                    # add to map:
-                    if (genetic_alteration_type, data_type) not in alt_type_datatype_and_stable_id:
-                        alt_type_datatype_and_stable_id[(genetic_alteration_type, data_type)] = []
-                    alt_type_datatype_and_stable_id[(genetic_alteration_type, data_type)].append(line_cols[2])
-        # init:
-        stable_id = meta_dictionary['stable_id']
-        genetic_alteration_type = meta_dictionary['genetic_alteration_type']
-        data_type = meta_dictionary['datatype']
-        # validate the genetic_alteration_type/data_type combination:
-        if (genetic_alteration_type, data_type) not in alt_type_datatype_and_stable_id:
-            # unexpected as this is already validated in get_meta_file_type
-            raise RuntimeError('Unexpected error: genetic_alteration_type and data_type combination not found in allowed_data_types.txt.',
-                               genetic_alteration_type, data_type)
-        # Check whether a wild card ('*') is set in allowed_data_types.txt for the alteration type-data type combination.
-        # For these entries the stable_id is not validated, but assumed to be checked for uniqueness by the user.
-        elif alt_type_datatype_and_stable_id[(genetic_alteration_type, data_type)][0] == "*":
-            pass
-        # validate stable_id:
-        elif stable_id not in alt_type_datatype_and_stable_id[(genetic_alteration_type, data_type)]:
-            logger.error("Invalid stable id for genetic_alteration_type '%s', "
-                         "data_type '%s'; expected one of [%s]",
-                        genetic_alteration_type,
-                        data_type,
-                        ', '.join(alt_type_datatype_and_stable_id[(genetic_alteration_type, data_type)]),
-                        extra={'filename_': filename,
-                               'cause': stable_id}
-                        )
+
+    # Allowed (genetic_alteration_type, datatype, stable_id) combinations, as a set of tuples.
+    # A stable_id of "*" means any stable_id is accepted for that type/datatype pair.
+    allowed_combinations = {
+        ("COPY_NUMBER_ALTERATION", "DISCRETE", "cna"),
+        ("COPY_NUMBER_ALTERATION", "DISCRETE_LONG", "cna"),
+        ("COPY_NUMBER_ALTERATION", "DISCRETE", "cna_rae"),
+        ("COPY_NUMBER_ALTERATION", "DISCRETE", "cna_consensus"),
+        ("COPY_NUMBER_ALTERATION", "DISCRETE", "gistic"),
+        ("COPY_NUMBER_ALTERATION", "DISCRETE_LONG", "gistic"),
+        ("COPY_NUMBER_ALTERATION", "CONTINUOUS", "linear_CNA"),
+        ("COPY_NUMBER_ALTERATION", "LOG2-VALUE", "log2CNA"),
+        ("MUTATION_EXTENDED", "MAF", "mutations"),
+        ("MUTATION_UNCALLED", "MAF", "mutations_uncalled"),
+        ("MRNA_EXPRESSION", "CONTINUOUS", "mrna_U133"),
+        ("MRNA_EXPRESSION", "Z-SCORE", "mrna_U133_Zscores"),
+        ("MRNA_EXPRESSION", "Z-SCORE", "mrna_U133_all_sample_Zscores"),
+        ("MRNA_EXPRESSION", "Z-SCORE", "rna_seq_mrna_median_Zscores"),
+        ("MRNA_EXPRESSION", "Z-SCORE", "rna_seq_mrna_median_all_sample_Zscores"),
+        ("MRNA_EXPRESSION", "Z-SCORE", "mrna_median_Zscores"),
+        ("MRNA_EXPRESSION", "CONTINUOUS", "rna_seq_mrna"),
+        ("MRNA_EXPRESSION", "CONTINUOUS", "rna_seq_v2_mrna"),
+        ("MRNA_EXPRESSION", "Z-SCORE", "rna_seq_v2_mrna_median_Zscores"),
+        ("MRNA_EXPRESSION", "Z-SCORE", "rna_seq_v2_mrna_median_all_sample_Zscores"),
+        ("MRNA_EXPRESSION", "CONTINUOUS", "rna_seq_v2_mrna_median_normals"),
+        ("MRNA_EXPRESSION", "Z-SCORE", "rna_seq_v2_mrna_median_normals_Zscores"),
+        ("MRNA_EXPRESSION", "Z-SCORE", "rna_seq_v2_mrna_median_all_sample_ref_normal_Zscores"),
+        ("MRNA_EXPRESSION", "CONTINUOUS", "mirna"),
+        ("MRNA_EXPRESSION", "Z-SCORE", "mirna_median_Zscores"),
+        ("MRNA_EXPRESSION", "Z-SCORE", "mrna_merged_median_Zscores"),
+        ("MRNA_EXPRESSION", "CONTINUOUS", "mrna"),
+        ("MRNA_EXPRESSION", "Z-SCORE", "mrna_seq_fpkm_Zscores"),
+        ("MRNA_EXPRESSION", "Z-SCORE", "mrna_seq_fpkm_all_sample_Zscores"),
+        ("MRNA_EXPRESSION", "Z-SCORE", "mrna_median_all_sample_Zscores"),
+        ("MRNA_EXPRESSION", "DISCRETE", "mrna_outliers"),
+        ("MRNA_EXPRESSION", "CONTINUOUS", "mrna_seq_fpkm_capture"),
+        ("MRNA_EXPRESSION", "Z-SCORE", "mrna_seq_fpkm_capture_Zscores"),
+        ("MRNA_EXPRESSION", "Z-SCORE", "mrna_seq_fpkm_capture_all_sample_Zscores"),
+        ("MRNA_EXPRESSION", "CONTINUOUS", "rna_seq_mrna_capture"),
+        ("MRNA_EXPRESSION", "Z-SCORE", "rna_seq_mrna_capture_Zscores"),
+        ("MRNA_EXPRESSION", "Z-SCORE", "rna_seq_mrna_capture_all_sample_Zscores"),
+        ("MRNA_EXPRESSION", "Z-SCORE", "mrna_seq_fpkm_polya_all_sample_Zscores"),
+        ("MRNA_EXPRESSION", "Z-SCORE", "mrna_seq_fpkm_polya_Zscores"),
+        ("MRNA_EXPRESSION", "CONTINUOUS", "mrna_seq_fpkm_polya"),
+        ("MRNA_EXPRESSION", "CONTINUOUS", "mrna_seq_cpm"),
+        ("MRNA_EXPRESSION", "Z-SCORE", "mrna_seq_cpm_all_sample_Zscores"),
+        ("MRNA_EXPRESSION", "Z-SCORE", "mrna_seq_cpm_Zscores"),
+        ("MRNA_EXPRESSION", "CONTINUOUS", "mrna_seq_tpm"),
+        ("MRNA_EXPRESSION", "Z-SCORE", "mrna_seq_tpm_all_sample_Zscores"),
+        ("MRNA_EXPRESSION", "Z-SCORE", "mrna_seq_tpm_Zscores"),
+        ("METHYLATION", "CONTINUOUS", "methylation_hm27"),
+        ("METHYLATION", "CONTINUOUS", "methylation_hm450"),
+        ("METHYLATION", "CONTINUOUS", "methylation_epic"),
+        ("PROTEIN_LEVEL", "LOG2-VALUE", "rppa"),
+        ("PROTEIN_LEVEL", "Z-SCORE", "rppa_Zscores"),
+        ("PROTEIN_LEVEL", "CONTINUOUS", "protein_quantification"),
+        ("PROTEIN_LEVEL", "Z-SCORE", "protein_quantification_zscores"),
+        ("PROTEIN_LEVEL", "LOG2-VALUE", "protein_quantification"),
+        ("GENESET_SCORE", "GSVA-SCORE", "gsva_scores"),
+        ("GENESET_SCORE", "P-VALUE", "gsva_pvalues"),
+        ("GENERIC_ASSAY", "LIMIT-VALUE", "*"),
+        ("GENERIC_ASSAY", "BINARY", "*"),
+        ("GENERIC_ASSAY", "CATEGORICAL", "*"),
+        ("STRUCTURAL_VARIANT", "SV", "structural_variants"),
+    }
+
+    # Only meta files that define all three fields are validated; any other file passes
+    if "genetic_alteration_type" in meta_dictionary and "datatype" in meta_dictionary and "stable_id" in meta_dictionary:
+        genetic_alteration_type = meta_dictionary["genetic_alteration_type"]
+        data_type = meta_dictionary["datatype"]
+        stable_id = meta_dictionary["stable_id"]
+
+        # Accept an exact match, or a wildcard entry for this type/datatype pair
+        if (genetic_alteration_type, data_type, stable_id) not in allowed_combinations and \
+           (genetic_alteration_type, data_type, "*") not in allowed_combinations:
+            logger.error(
+                "Invalid genetic_alteration_type, datatype, and stable_id combination: (%s, %s, %s). "
+                "Please check your meta files.",
+                genetic_alteration_type, data_type, stable_id,
+                extra={'filename_': filename}
+            )
             result = False
 
     return result
 
 
 def parse_metadata_file(filename,
                         logger,
                         study_id=None,

diff --git a/validation/validator/metaImport.py b/validation/validator/metaImport.py
diff --git a/validation/validator/validateData.py b/validation/validator/validateData.py
@@ -92,7 +92,7 @@
 MAX_SAMPLE_STABLE_ID_LENGTH = 63
 
 # global variable that defines the invalid ID characters
-INVALID_ID_CHARACTERS = r"[^A-Za-z0-9._\(\)'+-]"
+INVALID_ID_CHARACTERS = r"[^A-Za-z0-9._\(\)',+:;\[\]-]"  # '[' and ']' escaped and '-' kept last so all stay literals
 
 # ----------------------------------------------------------------------------
 
@@ -1682,6 +1682,7 @@ class MutationsExtendedValidator(CustomDriverAnnotationValidator, CustomNamespac
     def __init__(self, *args, **kwargs):
         super(MutationsExtendedValidator, self).__init__(*args, **kwargs)
         self.extraCols = []
+        self.seen_mutations = set()  # key-column tuples of rows already seen; read and filled by checkDuplicateMutation
 
     def checkHeader(self, cols):
         """Validate header, requiring at least one gene id column."""
@@ -1745,6 +1746,9 @@ def checkLine(self, data):
         self.checkAlleleMAFFormat(data)
         self.checkAlleleSpecialCases(data)
         self.checkValidationColumns(data)
+
+        # Validate duplicate mutations
+        self.checkDuplicateMutation(data)
 
         for col_index, col_name in enumerate(self.cols):
             # validate the column if there's a function defined for it
@@ -1763,6 +1767,24 @@ def checkLine(self, data):
                     raise RuntimeError(('Checking function %s set an error '
                                         'message but reported no error') %
                                        checking_function.__name__)
+    def checkDuplicateMutation(self, data):
+        """Log an error when this row's key-column values match those of an earlier row."""
+        # NOTE(review): this def appears to be inserted inside the body of checkLine, so
+        # the checkLine statements that follow it would become part of this method;
+        # it should probably move below the end of checkLine -- confirm.
+        key_columns = [
+            "Entrez_Gene_Id", "Chromosome", "Start_Position", "End_Position",
+            "Variant_Classification", "Tumor_Seq_Allele2", "HGVSp_Short", "Tumor_Sample_Barcode"
+        ]
+        # Rows are only compared when every key column is present in self.cols
+        if all(col in self.cols for col in key_columns):
+            mutation_key = tuple(data[self.cols.index(col)].strip() for col in key_columns)
+            if mutation_key in self.seen_mutations:
+                log_message = f"Duplicate mutation found: {mutation_key}"
+                self.logger.error(log_message, extra={'line_number': self.line_number})
+            else:
+                self.seen_mutations.add(mutation_key)
+
 
         # validate Tumor_Sample_Barcode value to make sure it exists in clinical sample list:
         sample_id_column_index = self.cols.index('Tumor_Sample_Barcode')
@@ -1972,35 +1994,32 @@ def checkAlleleMAFFormat(self, data):
         return True
 
     def checkAlleleSpecialCases(self, data):
-        """ Check other special cases which should or should not occur in Allele Based columns
-        Special cases are either from unofficial vcf2maf rules or discrepancies identified. """
+        """Log indel (INS/DEL) rows with NA/empty alleles or more than one '-' per allele; always return True."""
 
-        # First check if columns necessary exist in the data
+        # Skip the checks unless every column they read is present in self.cols
         necessary_columns = ['Reference_Allele', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'Variant_Type']
         if set(necessary_columns).issubset(self.cols):
             ref_allele = data[self.cols.index('Reference_Allele')].strip()
             tumor_seq_allele1 = data[self.cols.index('Tumor_Seq_Allele1')].strip()
             tumor_seq_allele2 = data[self.cols.index('Tumor_Seq_Allele2')].strip()
             variant_type = data[self.cols.index('Variant_Type')].strip()
 
-            # Check if Allele Based columns are not all the same
-            if ref_allele == tumor_seq_allele1 and tumor_seq_allele1 == tumor_seq_allele2:
-                log_message = "All Values in columns Reference_Allele, Tumor_Seq_Allele1 " \
-                              "and Tumor_Seq_Allele2 are equal."
-                extra_dict = {'line_number': self.line_number,
-                              'cause': '(%s, %s, %s)' % (ref_allele, tumor_seq_allele1, tumor_seq_allele2)}
+            alleles = (ref_allele, tumor_seq_allele1, tumor_seq_allele2)
+            is_indel = variant_type in ("INS", "DEL")
+            # Every allele column of an indel must hold a value other than NA or empty
+            if is_indel and any(allele in ("NA", "") for allele in alleles):
+                log_message = "Indel (INS/DEL) mutation has NA or missing values in allele columns."
+                extra_dict = {'line_number': self.line_number, 'cause': f'({ref_allele}, {tumor_seq_allele1}, {tumor_seq_allele2})'}
                 self.send_log_message(self.strict_maf_checks, log_message, extra_dict)
 
-            # In case of deletion, check when Reference_Allele is the same length as both Tumor_Seq_Allele if at least
-            # one of the Tumor_Seq_Alleles is a deletion ('-') otherwise a SNP
-            if variant_type == "DEL" and len(ref_allele) == len(tumor_seq_allele1) \
-                    and len(ref_allele) == len(tumor_seq_allele2) and "-" not in tumor_seq_allele1 \
-                    and "-" not in tumor_seq_allele2:
-                log_message = "Variant_Type indicates a deletion, Allele based columns are the same length, " \
-                              "but Tumor_Seq_Allele columns do not contain -, indicating a SNP."
-                extra_dict = {'line_number': self.line_number,
-                              'cause': '(%s, %s, %s)' % (ref_allele, tumor_seq_allele1, tumor_seq_allele2)}
-                self.send_log_message(self.strict_maf_checks, log_message, extra_dict)
+            # An indel allele may hold at most one '-'; the row is reported once even if several alleles break this
+            if is_indel and any(allele.count('-') > 1 for allele in alleles):
+                log_message = "Indel (INS/DEL) mutation contains multiple '-' in allele columns."
+                extra_dict = {
+                    'line_number': self.line_number,
+                    'cause': f'({ref_allele}, {tumor_seq_allele1}, {tumor_seq_allele2})',
+                }
+                self.send_log_message(self.strict_maf_checks, log_message, extra_dict)
 
         return True
 

diff --git a/validation/validator/validateStudies.py b/validation/validator/validateStudies.py