
Commit 2fcf03e

PromoterAI benchmarks (#16)
* Add PromoterAI benchmark dataset processing
* Add PromoterAI benchmark results
* Refactor sat_mut_mpra
1 parent eb9d04b commit 2fcf03e

6 files changed: +209 additions, -41 deletions


experiments/evals/config/config.yaml

Lines changed: 40 additions & 2 deletions
@@ -11,9 +11,42 @@ sat_mut_mpra_promoter:
   - PKLR
   - TERT
 
+# PromoterAI benchmark datasets from GitHub
+promoterai_benchmarks:
+  promoterai_gtex_outlier: GTEx_outlier.tsv
+  promoterai_cagi5_saturation: CAGI5_saturation.tsv
+  promoterai_mpra_saturation: MPRA_saturation.tsv
+  promoterai_gtex_eqtl: GTEx_eQTL.tsv
+  promoterai_mpra_eqtl: MPRA_eQTL.tsv
+  promoterai_ukbb_proteome: UKBB_proteome.tsv
+  promoterai_gel_rna: GEL_RNA.tsv
+
+# Combined dataset groups for efficient batch inference
+combined_dataset_groups:
+  promoterai_combined:
+    datasets:
+      - promoterai_gtex_outlier
+      - promoterai_cagi5_saturation
+      - promoterai_mpra_saturation
+      - promoterai_gtex_eqtl
+      - promoterai_mpra_eqtl
+      - promoterai_ukbb_proteome
+      - promoterai_gel_rna
+  sat_mut_mpra_combined:
+    datasets:
+      - sat_mut_mpra_promoter_F9
+      - sat_mut_mpra_promoter_GP1BA
+      - sat_mut_mpra_promoter_HBB
+      - sat_mut_mpra_promoter_HBG1
+      - sat_mut_mpra_promoter_HNF4A
+      - sat_mut_mpra_promoter_LDLR
+      - sat_mut_mpra_promoter_MSMB
+      - sat_mut_mpra_promoter_PKLR
+      - sat_mut_mpra_promoter_TERT
+
 context_size: 512
-per_device_batch_size: 128
-torch_compile: False  # overhead not worth it for small datasets and fast models
+per_device_batch_size: 512
+torch_compile: True  # consider if overhead is worth it for small datasets and fast models
 
 # first part run for 370k steps, second part run for 130k steps
 models:
@@ -49,3 +82,8 @@ dataset_configs:
     # This applies to all promoter-specific datasets
     metrics: [Spearman]
     scorings: [absLLR.plus.score]
+
+  promoterai_benchmark:
+    # This applies to all PromoterAI benchmark datasets
+    metrics: [AUPRC]
+    scorings: [absLLR.plus.score]
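For orientation, a minimal sketch of how these new config keys look once parsed, assuming only that PyYAML is available (Snakemake does the equivalent via its configfile directive; key names are taken from the diff above):

    import yaml

    # Load the eval config the same way Snakemake's configfile directive would
    with open("experiments/evals/config/config.yaml") as f:
        config = yaml.safe_load(f)

    # Individual PromoterAI benchmarks: dataset name -> source TSV filename
    for name, tsv in config["promoterai_benchmarks"].items():
        print(f"{name}: {tsv}")

    # Combined groups: one batched inference pass per group, split back out later
    for group, spec in config["combined_dataset_groups"].items():
        print(f"{group}: {len(spec['datasets'])} member datasets")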

experiments/evals/workflow/Snakefile

Lines changed: 25 additions & 3 deletions
@@ -2,18 +2,31 @@ configfile: "config/config.yaml"
 
 
 def get_all_datasets():
-    """Get list of all dataset names for wildcard constraints."""
+    """Get list of all dataset names that have metrics computed (individual benchmarks only)."""
     datasets = []
     for dataset in config["dataset_configs"].keys():
         if dataset == "sat_mut_mpra_promoter":
             # Expand for each promoter
             for promoter in config["sat_mut_mpra_promoter"]:
                 datasets.append(f"sat_mut_mpra_promoter_{promoter}")
+        elif dataset == "promoterai_benchmark":
+            # Expand for each PromoterAI benchmark
+            for benchmark in config["promoterai_benchmarks"].keys():
+                datasets.append(benchmark)
         else:
             datasets.append(dataset)
     return datasets
 
 
+def get_all_datasets_including_combined():
+    """Get all datasets including combined groups for intermediate processing."""
+    datasets = get_all_datasets()
+    # Add combined dataset groups
+    for group_name in config.get("combined_dataset_groups", {}).keys():
+        datasets.append(group_name)
+    return datasets
+
+
 def get_all_metric_files():
     """Generate list of all metric files based on dataset_configs."""
     files = []
@@ -29,6 +42,15 @@ def get_all_metric_files():
                     files.append(
                         f"results/metrics/{dataset_name}/{metric}/{model}_{scoring}.tsv"
                     )
+        elif dataset == "promoterai_benchmark":
+            # Handle promoterai_benchmark - expand for each benchmark
+            for benchmark in config["promoterai_benchmarks"].keys():
+                for metric in cfg["metrics"]:
+                    for model in config["models"].keys():
+                        for scoring in cfg["scorings"]:
+                            files.append(
+                                f"results/metrics/{benchmark}/{metric}/{model}_{scoring}.tsv"
+                            )
         else:
             # Regular datasets
             for metric in cfg["metrics"]:
@@ -63,11 +85,11 @@ include: "rules/common.smk"
 include: "rules/gnomad.smk"
 include: "rules/metrics.smk"
 include: "rules/model.smk"
+include: "rules/promoterai_benchmarks.smk"
 include: "rules/sat_mut_mpra.smk"
 include: "rules/traitgym.smk"
 
 
 rule all:
     input:
-        get_all_metric_files(),
-        get_all_correlation_files()
+        get_all_correlation_files(),
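To see what the two helpers yield, here is a self-contained approximation of their expansion logic against a stub config (illustrative only; the real functions read Snakemake's global config object populated from config.yaml):

    config = {
        "dataset_configs": {
            "sat_mut_mpra_promoter": {},
            "promoterai_benchmark": {},
        },
        "sat_mut_mpra_promoter": ["PKLR", "TERT"],
        "promoterai_benchmarks": {"promoterai_gtex_outlier": "GTEx_outlier.tsv"},
        "combined_dataset_groups": {
            "promoterai_combined": {"datasets": ["promoterai_gtex_outlier"]},
        },
    }

    def get_all_datasets():
        datasets = []
        for dataset in config["dataset_configs"]:
            if dataset == "sat_mut_mpra_promoter":
                datasets += [f"sat_mut_mpra_promoter_{p}" for p in config["sat_mut_mpra_promoter"]]
            elif dataset == "promoterai_benchmark":
                datasets += list(config["promoterai_benchmarks"])
            else:
                datasets.append(dataset)
        return datasets

    def get_all_datasets_including_combined():
        # Combined groups are valid targets for intermediate files
        # (features, predictions) but not for per-benchmark metrics
        return get_all_datasets() + list(config.get("combined_dataset_groups", {}))

    print(get_all_datasets())
    # ['sat_mut_mpra_promoter_PKLR', 'sat_mut_mpra_promoter_TERT', 'promoterai_gtex_outlier']
    print(get_all_datasets_including_combined())
    # ['sat_mut_mpra_promoter_PKLR', 'sat_mut_mpra_promoter_TERT', 'promoterai_gtex_outlier', 'promoterai_combined']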

experiments/evals/workflow/rules/metrics.smk

Lines changed: 73 additions & 34 deletions
@@ -1,46 +1,85 @@
-rule metrics_AUPRC:
-    input:
-        "results/dataset/{dataset}.parquet",
-        "results/prediction/{dataset}/{model}.parquet",
-    output:
-        "results/metrics/{dataset}/AUPRC/{model}.tsv",
-    wildcard_constraints:
-        dataset="|".join(get_all_datasets()),
-    run:
-        y_true = pd.read_parquet(input[0], columns=["label"]).label
-        y_pred = pd.read_parquet(input[1], columns=["score"]).score
-        AUPRC = average_precision_score(y_true, y_pred)
-        pd.DataFrame({"AUPRC": [AUPRC]}).to_csv(output[0], sep="\t", index=False, float_format="%.3f")
+# Metric function definitions
+def metric_auprc(y_true, y_pred):
+    """Compute Area Under Precision-Recall Curve."""
+    return average_precision_score(y_true, y_pred)
 
 
-rule metrics_AUROC:
-    input:
-        "results/dataset/{dataset}.parquet",
-        "results/prediction/{dataset}/{model}.parquet",
-    output:
-        "results/metrics/{dataset}/AUROC/{model}.tsv",
-    wildcard_constraints:
-        dataset="|".join(get_all_datasets()),
-    run:
-        y_true = pd.read_parquet(input[0], columns=["label"]).label
-        y_pred = pd.read_parquet(input[1], columns=["score"]).score
-        AUROC = roc_auc_score(y_true, y_pred)
-        pd.DataFrame({"AUROC": [AUROC]}).to_csv(output[0], sep="\t", index=False, float_format="%.3f")
+def metric_auroc(y_true, y_pred):
+    """Compute Area Under ROC Curve."""
+    return roc_auc_score(y_true, y_pred)
+
 
+def metric_spearman(y_true, y_pred):
+    """Compute Spearman correlation coefficient."""
+    return spearmanr(y_true, y_pred)[0]
 
-rule metrics_Spearman:
+
+# Metric registry - maps metric names to functions
+METRIC_FUNCTIONS = {
+    "AUPRC": metric_auprc,
+    "AUROC": metric_auroc,
+    "Spearman": metric_spearman,
+}
+
+
+def get_combined_group(dataset_name):
+    """Return combined group name if dataset belongs to one, else None."""
+    for group_name, group_config in config.get("combined_dataset_groups", {}).items():
+        if dataset_name in group_config["datasets"]:
+            return group_name
+    return None
+
+
+def get_dataset_input(wildcards):
+    """Get dataset input path - combined or individual."""
+    combined_group = get_combined_group(wildcards.dataset)
+    if combined_group:
+        return f"results/dataset/{combined_group}.parquet"
+    else:
+        return f"results/dataset/{wildcards.dataset}.parquet"
+
+
+def get_prediction_input(wildcards):
+    """Get prediction input path - combined or individual."""
+    combined_group = get_combined_group(wildcards.dataset)
+    if combined_group:
+        return f"results/prediction/{combined_group}/{wildcards.model}.parquet"
+    else:
+        return f"results/prediction/{wildcards.dataset}/{wildcards.model}.parquet"
+
+
+rule metrics:
+    """Unified metrics rule - handles AUPRC, AUROC, Spearman for all datasets."""
     input:
-        "results/dataset/{dataset}.parquet",
-        "results/prediction/{dataset}/{model}.parquet",
+        dataset=get_dataset_input,
+        prediction=get_prediction_input,
     output:
-        "results/metrics/{dataset}/Spearman/{model}.tsv",
+        "results/metrics/{dataset}/{metric}/{model}.tsv",
     wildcard_constraints:
         dataset="|".join(get_all_datasets()),
     run:
-        y_true = pd.read_parquet(input[0], columns=["label"]).label
-        y_pred = pd.read_parquet(input[1], columns=["score"]).score
-        Spearman = spearmanr(y_true, y_pred)[0]
-        pd.DataFrame({"Spearman": [Spearman]}).to_csv(output[0], sep="\t", index=False, float_format="%.3f")
+        # Load data
+        df_dataset = pd.read_parquet(input.dataset)
+        df_pred = pd.read_parquet(input.prediction)
+
+        # Filter to specific benchmark if using combined dataset (positional filtering)
+        if 'dataset' in df_dataset.columns:
+            mask = df_dataset['dataset'] == wildcards.dataset
+            df_dataset = df_dataset[mask]
+            df_pred = df_pred[mask]  # Apply same positional mask
+
+        # Extract labels and scores
+        y_true = df_dataset["label"]
+        y_pred = df_pred["score"]
+
+        # Compute metric using registry
+        metric_func = METRIC_FUNCTIONS[wildcards.metric]
+        value = metric_func(y_true, y_pred)
+
+        # Save result
+        pd.DataFrame({wildcards.metric: [value]}).to_csv(
+            output[0], sep="\t", index=False, float_format="%.3f"
+        )
 
 
 rule aggregate_metrics:
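Note that the filtering in the unified rule is positional: the boolean mask is built from the dataset frame and reused on the prediction frame, which only works if the prediction parquet preserves the combined dataset's row order and length. A toy demonstration of that invariant (made-up data; column names follow the rule above):

    import pandas as pd
    from sklearn.metrics import average_precision_score

    # Toy combined dataset with predictions that are row-aligned by construction
    df_dataset = pd.DataFrame({
        "dataset": ["a", "a", "b", "b"],
        "label":   [1, 0, 1, 0],
    })
    df_pred = pd.DataFrame({"score": [0.9, 0.2, 0.8, 0.4]})

    # The invariant the rule relies on: same length, same row order
    assert len(df_dataset) == len(df_pred)

    mask = df_dataset["dataset"] == "a"   # boolean mask from the dataset frame
    y_true = df_dataset[mask]["label"]
    y_pred = df_pred[mask]["score"]       # same positional mask applied to predictions

    print(average_precision_score(y_true, y_pred))  # 1.0 for this toy data

If the prediction file were ever sorted or deduplicated independently, the mask would silently select the wrong rows, so the row-alignment assumption is the key thing to preserve upstream.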

experiments/evals/workflow/rules/model.smk

Lines changed: 2 additions & 2 deletions
@@ -21,7 +21,7 @@ rule model_llr:
     output:
         "results/features/{dataset}/{model}_LLR.parquet",
     wildcard_constraints:
-        dataset="|".join(get_all_datasets()),
+        dataset="|".join(get_all_datasets_including_combined()),
         model="|".join(config["models"].keys()),
     threads:
         workflow.cores
@@ -57,7 +57,7 @@ rule model_abs_llr:
     output:
         "results/features/{dataset}/{model}_absLLR.parquet",
     wildcard_constraints:
-        dataset="|".join(get_all_datasets()),
+        dataset="|".join(get_all_datasets_including_combined()),
         model="|".join(config["models"].keys()),
     run:
         df = pd.read_parquet(input[0])
experiments/evals/workflow/rules/promoterai_benchmarks.smk

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
+# PromoterAI benchmark datasets from GitHub
+# Downloads and processes all benchmarks from: https://github.com/Illumina/PromoterAI/tree/master/data/benchmark
+
+rule promoterai_benchmark:
+    output:
+        "results/dataset/{dataset}.parquet",
+    wildcard_constraints:
+        dataset="|".join(config["promoterai_benchmarks"].keys()),
+    params:
+        filename=lambda wildcards: config["promoterai_benchmarks"][wildcards.dataset],
+    run:
+        url = f"https://raw.githubusercontent.com/Illumina/PromoterAI/master/data/benchmark/{params.filename}"
+        V = pd.read_csv(url, sep="\t")
+        V["chrom"] = V["chrom"].str.replace("^chr", "", regex=True)
+        # Group by coordinates (variants near multiple genes)
+        # Label is True if consequence is not "none" for ANY gene
+        V_grouped = V.groupby(COORDINATES, as_index=False).agg({
+            "consequence": lambda x: (x != "none").any(),
+        })
+        V_grouped = V_grouped.rename(columns={"consequence": "label"})
+        V_grouped.to_parquet(output[0], index=False)
+
+
+rule combine_promoterai_datasets:
+    """Combine all promoterai benchmarks into a single dataset for efficient batch inference."""
+    input:
+        lambda wildcards: expand(
+            "results/dataset/{dataset}.parquet",
+            dataset=config["combined_dataset_groups"][wildcards.combined_group]["datasets"]
+        )
+    output:
+        "results/dataset/{combined_group}.parquet"
+    wildcard_constraints:
+        combined_group="promoterai_combined"
+    run:
+        datasets = config["combined_dataset_groups"][wildcards.combined_group]["datasets"]
+        dfs = []
+        for dataset_name, dataset_path in zip(datasets, input):
+            df = pd.read_parquet(dataset_path)
+            df["dataset"] = dataset_name  # Add dataset identifier column
+            dfs.append(df)
+
+        # Concatenate all datasets
+        combined = pd.concat(dfs, ignore_index=True)
+        combined.to_parquet(output[0], index=False)
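The grouping step collapses per-gene rows into one row per variant. COORDINATES is not defined in this file; it presumably comes from rules/common.smk and is assumed below to be the usual variant key ["chrom", "pos", "ref", "alt"]. A toy illustration of the labeling logic (the consequence strings are made up; only "none" vs non-"none" matters):

    import pandas as pd

    COORDINATES = ["chrom", "pos", "ref", "alt"]  # assumed definition; see rules/common.smk

    # One variant annotated against two genes: any non-"none" consequence makes it a positive
    V = pd.DataFrame({
        "chrom": ["1", "1", "2"],
        "pos": [100, 100, 200],
        "ref": ["A", "A", "C"],
        "alt": ["G", "G", "T"],
        "consequence": ["none", "underexpression", "none"],
    })

    V_grouped = V.groupby(COORDINATES, as_index=False).agg(
        {"consequence": lambda x: (x != "none").any()}
    ).rename(columns={"consequence": "label"})

    print(V_grouped)
    #   chrom  pos ref alt  label
    # 0     1  100   A   G   True
    # 1     2  200   C   T  False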

experiments/evals/workflow/rules/sat_mut_mpra.smk

Lines changed: 24 additions & 0 deletions
@@ -6,3 +6,27 @@ rule sat_mut_mpra_promoter_dataset:
         V["label"] = V["label"].abs()  # abs(LFC)
         for promoter, path in zip(config["sat_mut_mpra_promoter"], output):
             V[V["element"] == promoter].to_parquet(path, index=False)
+
+
+rule combine_sat_mut_mpra_datasets:
+    """Combine all sat_mut_mpra promoter datasets into a single dataset for efficient batch inference."""
+    input:
+        lambda wildcards: expand(
+            "results/dataset/{dataset}.parquet",
+            dataset=config["combined_dataset_groups"][wildcards.combined_group]["datasets"]
+        )
+    output:
+        "results/dataset/{combined_group}.parquet"
+    wildcard_constraints:
+        combined_group="sat_mut_mpra_combined"
+    run:
+        datasets = config["combined_dataset_groups"][wildcards.combined_group]["datasets"]
+        dfs = []
+        for dataset_name, dataset_path in zip(datasets, input):
+            df = pd.read_parquet(dataset_path)
+            df["dataset"] = dataset_name  # Add dataset identifier column
+            dfs.append(df)
+
+        # Concatenate all datasets
+        combined = pd.concat(dfs, ignore_index=True)
+        combined.to_parquet(output[0], index=False)
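Because both combine rules write results/dataset/{combined_group}.parquet, the wildcard_constraints on combined_group are what keep them unambiguous: each rule matches only its own group name. A combined dataset, or any downstream metric file, can then be requested as an ordinary Snakemake target; a plausible invocation (core count and model name are placeholders, not from this commit):

    snakemake --cores 8 results/dataset/sat_mut_mpra_combined.parquet
    snakemake --cores 8 results/metrics/promoterai_gtex_outlier/AUPRC/some_model_absLLR.plus.score.tsv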
