forecastingresearch · simas-fri · Sep 19, 2025 · Sep 19, 2025 · Sep 19, 2025 · Sep 19, 2025
diff --git a/src/ranking_sim.py b/src/ranking_sim.py
@@ -320,63 +320,54 @@ def simulate_random_sampling(df, n_questions_per_model, ref_model="Always 0.5"):
     """Simulate a dataset by drawing a n_questions_per_model random questions
     (sampled with replacement) from the full sample of questions. ref_model
     answers all questions"""
-    # Get parameters
-    models = df["model"].unique()
+    # Extract variables
     questions = df["question_id"].unique()
-    n_models = len(models)
-    n_questions = len(questions)
+    models = df["model"].unique()
 
     # Check if ref_model exists
-    if ref_model is not None and ref_model in models:
-        # Find ref_model index
-        ref_model_idx = np.where(models == ref_model)[0][0]
-
-        # Create indices for all OTHER models
-        other_model_indices = np.arange(n_models)
-        other_model_indices = other_model_indices[other_model_indices != ref_model_idx]
-
-        # Sample for other models
-        all_model_indices = np.repeat(other_model_indices, n_questions_per_model)
-        all_question_indices = np.concatenate(
-            [
-                np.random.choice(n_questions, size=n_questions_per_model, replace=True)
-                for _ in range(len(other_model_indices))
-            ]
-        )
-
-        # Append reference model with ALL questions at the end
-        all_model_indices = np.concatenate(
-            [all_model_indices, np.repeat(ref_model_idx, n_questions)]
-        )
-        all_question_indices = np.concatenate(
-            [all_question_indices, np.arange(n_questions)]
-        )
-    else:
+    if ref_model is None or ref_model not in models:
         raise ValueError("Reference model not provided.")
 
-    # Convert indices to actual model/question values
-    sampled_models = models[all_model_indices]
-    sampled_questions = questions[all_question_indices]
-
-    # Create DataFrame of sampled (model, question) pairs
-    # Include a sample_id to handle duplicates from sampling with replacement
-    df_samples = pd.DataFrame(
-        {
-            "model": sampled_models,
-            "question_id": sampled_questions,
-            "sample_id": np.arange(len(sampled_models)),  # Unique ID for each sample
-        }
+    other_models = [model for model in models if model != ref_model]
+    n_other_models = len(other_models)
+
+    # Draw questions for non-reference models
+    df_samples = pd.DataFrame()
+    df_samples["model"] = np.repeat(other_models, n_questions_per_model)
+    df_samples["question_id"] = np.random.choice(
+        questions, size=n_questions_per_model * n_other_models, replace=True
+    )
+
+    # Calculate number of occurences of question_id for a given model.
+    # This is for getting a unique primary key for simulated questions.
+    # This approach treats the same question_id (from the original dataset)
+    # resampled k times as k different questions (i.e., k different
+    # sim_question_id's)
+    df_samples = df_samples.sort_values(["model", "question_id"]).reset_index(drop=True)
+    df_samples["occ_question_id"] = (
+        df_samples.groupby(["model", "question_id"]).cumcount() + 1
     )
 
-    # Single merge operation to get all data
-    # The sample_id ensures we keep duplicates when sampling with replacement
+    # Create a unique primary key for simulated questions
+    df_samples["sim_question_id"] = (
+        df_samples["question_id"] + "-" + df_samples["occ_question_id"].astype(str)
+    )
+
+    # Add reference model
+    df_temp = df_samples[["question_id", "sim_question_id"]].copy().drop_duplicates()
+    df_temp["model"] = ref_model
+    df_samples = pd.concat([df_samples, df_temp], ignore_index=True)
+
+    # Get data on forecasts and realizations from the original dataset
     df_results = df_samples.merge(
         df[["model", "question_id", "forecast", "resolved_to", "question_type"]],
         on=["model", "question_id"],
         how="left",
     )
 
     # Clean up
+    df_results["question_id"] = df_results["sim_question_id"]
+    df_results = df_results.drop(["occ_question_id", "sim_question_id"], axis=1)
     df_results = df_results.reset_index(drop=True)
 
     return df_results
@@ -407,11 +398,15 @@ def fixed_dataset_market_question_sample(df, n):
     # The remainder
     n_market = n - n_dataset
 
+    # Since sampling is done with replacement, we need at least
+    # 1 market and at least 1 dataset questions, and the required
+    # number of market questions to sample should be >= 1
     if (
         n_dataset_horizon < 1
         or n_market < 1
-        or n_dataset > len(df["question_type"] == "dataset")
-        or n_market > len(df["question_type"] == "market")
+        or len(dataset_groups.values[0]) < 1  # At least one dataset question exists
+        or len(df[df["question_type"] == "market"])
+        < 1  # At least one market question exists
     ):
         raise ValueError(
             f"`fixed_dataset_market_question_sample()` needs a bigger `n`. It was "
@@ -667,6 +662,28 @@ def simulate_round_based(
     # Create DataFrame from all samples
     df_samples = pd.DataFrame(data_rows)
 
+    # Create a unique primary key for simulated questions.
+    # A question with the same question_id (from the original dataset)
+    # that is drawn in round R and R + 1 is treated as a different question
+    # (i.e., will have different sim_question_id's). Also,
+    # if the same question is drawn k > 1 times in the same round R,
+    # it will also get different sim_question_id's.
+    df_samples = df_samples.sort_values(
+        ["model", "round_id", "question_id"], ascending=True
+    ).reset_index(drop=True)
+    df_samples["occ_question_id"] = (
+        df_samples.groupby(["model", "round_id", "question_id"]).cumcount() + 1
+    )
+
+    # Create unique sim_question_id for within-round duplicates
+    df_samples["sim_question_id"] = (
+        df_samples["question_id"]
+        + "-R"
+        + df_samples["round_id"].astype(str)
+        + "-"
+        + df_samples["occ_question_id"].astype(str)
+    )
+
     # Merge with original data to get forecasts and outcomes
     # round_id ensures uniqueness during merge
     df_results = df_samples.merge(
@@ -675,6 +692,11 @@ def simulate_round_based(
         how="left",
     )
 
+    # Clean up
+    df_results["question_id"] = df_results["sim_question_id"]
+    df_results = df_results.drop(["occ_question_id", "sim_question_id"], axis=1)
+    df_results = df_results.reset_index(drop=True)
+
     return df_results
 
 

diff --git a/tests/test_ranking_sim.py b/tests/test_ranking_sim.py
@@ -513,7 +513,7 @@ def test_fixed_dataset_market_question_sample_success(
             assert all(f"{source}_{qid}_{h}" in questions for h in dataset_horizons)
 
 
-@pytest.mark.parametrize("N", [99, 3, 0])
+@pytest.mark.parametrize("N", [3, 0])
 def test_fixed_dataset_market_question_sample_errors(df_for_sampling, N):
     """Test error is thrown for bad values of `N`."""
     with pytest.raises(ValueError):
@@ -1266,10 +1266,9 @@ def test_simulate_random_sampling():
     # Test with 20% overlap
     df_sim = simulate_random_sampling(df, n_questions_per_model=2, ref_model="A")
 
-    # Check that ref model A has all questions
+    # Check that ref model A answers all questions
     a_questions = df_sim[df_sim["model"] == "A"]["question_id"].unique()
-    assert len(a_questions) == 3  # All 3 questions
-    assert set(a_questions) == {"q1", "q2", "q3"}
+    assert set(a_questions) == set(df_sim["question_id"].unique())
 
     # Check that other models have fewer questions
     b_questions = df_sim[df_sim["model"] == "B"]["question_id"].values
@@ -1278,22 +1277,19 @@ def test_simulate_random_sampling():
     assert len(c_questions) == 2
 
     # Check that all data is preserved correctly
+    df_sim["orig_question_id"] = (
+        df_sim["question_id"].str.rsplit("-", n=1).str[0]
+    )  # Get the corresponding original question_id
     for _, row in df_sim.iterrows():
         # Find corresponding row in original
-        mask = (df["model"] == row["model"]) & (df["question_id"] == row["question_id"])
+        mask = (df["model"] == row["model"]) & (
+            df["question_id"] == row["orig_question_id"]
+        )
         orig_row = df[mask].iloc[0]
         assert row["forecast"] == orig_row["forecast"]
         assert row["resolved_to"] == orig_row["resolved_to"]
         assert row["question_type"] == orig_row["question_type"]
 
-    # Test with n_questions_per_model=3
-    df_sim_full = simulate_random_sampling(df, n_questions_per_model=3, ref_model="A")
-
-    # Each model should have all 3 questions
-    for model in ["A", "B", "C"]:
-        model_samples = len(df_sim_full[df_sim_full["model"] == model])
-        assert model_samples == 3
-
     # Test that ref_model must exist
     with pytest.raises(ValueError, match="Reference model not provided"):
         simulate_random_sampling(df, n_questions_per_model=2, ref_model="NonExistent")
@@ -1451,7 +1447,7 @@ def test_evaluate_ranking_methods_oracle():
         # Oracle should almost always be ranked #1
         top1_retention = method_results["Top-1 Retention"].mean()
         assert (
-            top1_retention > 0.90
+            top1_retention > 0.85
         ), f"Oracle should almost always be top-1 \
              with {method}, got {top1_retention}"
 
@@ -2043,7 +2039,7 @@ def test_simulation_regression_results():
     - n_questions_per_model: 125
     - dataset_weight: 0.5
     - simulation_method: "random_sampling"
-    - ref_model = "GPT-4 (zero shot)"
+    - ref_model = "Naive Forecaster"
     """
     np.random.seed(20250527)
 
@@ -2077,10 +2073,10 @@ def test_simulation_regression_results():
 
     # Expected results (from known good run)
     expected_results = {
-        "Brier": {"Spearman": 0.726376, "Top-20 Retention": 0.503},
-        "Diff-Adj. Brier": {"Spearman": 0.813342, "Top-20 Retention": 0.582},
+        "Brier": {"Spearman": 0.726328, "Top-20 Retention": 0.503},
+        "Diff-Adj. Brier": {"Spearman": 0.812766, "Top-20 Retention": 0.589},
         "BSS": {"Spearman": 0.134529, "Top-20 Retention": 0.321},
-        "Peer Score": {"Spearman": 0.813029, "Top-20 Retention": 0.590},
+        "Peer Score": {"Spearman": 0.811932, "Top-20 Retention": 0.577},
     }
 
     # Check results
@@ -2350,7 +2346,12 @@ def test_difficulty_drift_hard_share_grows():
             skill_temperature=None,
             difficulty_temperature=lambda r: beta[r],
         )
-        hard = (sim["question_id"] == "hard").groupby(sim["round_id"]).any()
+        hard = (
+            sim["question_id"]
+            .apply(lambda x: x[0:4] == "hard")
+            .groupby(sim["round_id"])
+            .any()
+        )
         hard_counts += hard.reindex(range(R), fill_value=False).astype(int).values
     hard_share = hard_counts / REP
 
@@ -2414,9 +2415,9 @@ def test_simulation_regression_round_based_results():
     # Expected results (from known good run)
     expected_results = {
         "Brier": {"Spearman": 0.694564, "Top-20 Retention": 0.476},
-        "Diff-Adj. Brier": {"Spearman": 0.771530, "Top-20 Retention": 0.557},
+        "Diff-Adj. Brier": {"Spearman": 0.770730, "Top-20 Retention": 0.560},
         "BSS": {"Spearman": 0.145300, "Top-20 Retention": 0.319},
-        "Peer Score": {"Spearman": 0.770672, "Top-20 Retention": 0.564},
+        "Peer Score": {"Spearman": 0.768716, "Top-20 Retention": 0.561},
     }
 
     # Check results
@@ -2469,3 +2470,112 @@ def test_persistence_60_percent():
     r1 = set(sim.loc[sim.round_id == 1, "model"]) - {"Always 0.5"}
 
     assert len(r0 & r1) == np.floor(0.6 * len(r0))  # 60 % persistence
+
+
+@pytest.mark.parametrize("simulation_type", ["random", "round_based"])
+@pytest.mark.parametrize(
+    "simulation_kwargs",
+    [
+        {"n_questions_per_model": 10},  # For random sampling only
+        {
+            "n_rounds": 3,
+            "questions_per_round": 5,
+            "models_per_round_mean": 3,
+        },  # For round_based only
+        {
+            "n_rounds": 2,
+            "questions_per_round": 8,
+            "models_per_round_mean": 4,
+            "model_persistence": 0.5,
+        },  # Round_based with persistence
+        {
+            "n_rounds": 4,
+            "questions_per_round": 3,
+            "models_per_round_mean": 2,
+            "fixed_models_per_round": True,
+        },  # Round_based fixed models
+    ],
+)
+def test_no_duplicates_at_model_question_level(simulation_type, simulation_kwargs):
+    """Test that there are no duplicate [model, question_id] combinations in
+    simulation results.
+
+    This ensures that each model answers each question at most once, which is critical
+    for proper ranking calculations.
+    """
+    # Create test dataset with sufficient questions and models
+    models = ["RefModel", "ModelA", "ModelB", "ModelC", "ModelD", "ModelE"]
+    question_ids = [f"q{i}" for i in range(20)]
+
+    data = []
+    for model in models:
+        for question_id in question_ids:
+            data.append(
+                {
+                    "model": model,
+                    "question_id": question_id,
+                    "forecast": np.random.uniform(0.1, 0.9),
+                    "resolved_to": np.random.choice([0, 1]),
+                    "question_type": "dataset",
+                }
+            )
+
+    df = pd.DataFrame(data)
+
+    # Set seed for reproducibility
+    np.random.seed(42)
+
+    # Run appropriate simulation based on type
+    if simulation_type == "random":
+        # Only use kwargs relevant to random sampling
+        relevant_kwargs = {
+            k: v for k, v in simulation_kwargs.items() if k in ["n_questions_per_model"]
+        }
+        if not relevant_kwargs:
+            pytest.skip("No relevant kwargs for random sampling")
+
+        df_sim = simulate_random_sampling(df, ref_model="RefModel", **relevant_kwargs)
+
+    elif simulation_type == "round_based":
+        # Only use kwargs relevant to round-based sampling
+        relevant_kwargs = {
+            k: v
+            for k, v in simulation_kwargs.items()
+            if k
+            in [
+                "n_rounds",
+                "questions_per_round",
+                "models_per_round_mean",
+                "model_persistence",
+                "fixed_models_per_round",
+            ]
+        }
+        if "n_rounds" not in relevant_kwargs:
+            pytest.skip("No relevant kwargs for round-based sampling")
+
+        df_sim = simulate_round_based(df, ref_model="RefModel", **relevant_kwargs)
+
+    # Check for duplicates at [model, question_id] level
+    duplicate_check = df_sim.groupby(["model", "question_id"]).size()
+    duplicates = duplicate_check[duplicate_check > 1]
+
+    if len(duplicates) > 0:
+        print(f"\nFound duplicates in {simulation_type} simulation:")
+        print(f"Simulation kwargs: {simulation_kwargs}")
+        print("Duplicate combinations:")
+        for (model, question_id), count in duplicates.items():
+            print(f"  Model '{model}', Question '{question_id}': {count} occurrences")
+
+        # Also show a sample of the problematic data
+        sample_duplicate = duplicates.index[0]
+        model, question_id = sample_duplicate
+        duplicate_rows = df_sim[
+            (df_sim["model"] == model) & (df_sim["question_id"] == question_id)
+        ]
+        print(f"\nSample duplicate rows for Model '{model}', Question '{question_id}':")
+        print(duplicate_rows.to_string())
+
+    # The assertion: no duplicates should exist
+    assert (
+        len(duplicates) == 0
+    ), f"Found {len(duplicates)} duplicate [model, question_id] combinations"