diff --git a/src/ranking_sim.py b/src/ranking_sim.py index 943372e..0564369 100644 --- a/src/ranking_sim.py +++ b/src/ranking_sim.py @@ -320,56 +320,45 @@ def simulate_random_sampling(df, n_questions_per_model, ref_model="Always 0.5"): """Simulate a dataset by drawing a n_questions_per_model random questions (sampled with replacement) from the full sample of questions. ref_model answers all questions""" - # Get parameters - models = df["model"].unique() + # Extract variables questions = df["question_id"].unique() - n_models = len(models) - n_questions = len(questions) + models = df["model"].unique() # Check if ref_model exists - if ref_model is not None and ref_model in models: - # Find ref_model index - ref_model_idx = np.where(models == ref_model)[0][0] - - # Create indices for all OTHER models - other_model_indices = np.arange(n_models) - other_model_indices = other_model_indices[other_model_indices != ref_model_idx] - - # Sample for other models - all_model_indices = np.repeat(other_model_indices, n_questions_per_model) - all_question_indices = np.concatenate( - [ - np.random.choice(n_questions, size=n_questions_per_model, replace=True) - for _ in range(len(other_model_indices)) - ] - ) - - # Append reference model with ALL questions at the end - all_model_indices = np.concatenate( - [all_model_indices, np.repeat(ref_model_idx, n_questions)] - ) - all_question_indices = np.concatenate( - [all_question_indices, np.arange(n_questions)] - ) - else: + if ref_model is None or ref_model not in models: raise ValueError("Reference model not provided.") - # Convert indices to actual model/question values - sampled_models = models[all_model_indices] - sampled_questions = questions[all_question_indices] - - # Create DataFrame of sampled (model, question) pairs - # Include a sample_id to handle duplicates from sampling with replacement - df_samples = pd.DataFrame( - { - "model": sampled_models, - "question_id": sampled_questions, - "sample_id": np.arange(len(sampled_models)), # Unique ID for each sample - } + other_models = [model for model in models if model != ref_model] + n_other_models = len(other_models) + + # Draw questions for non-reference models + df_samples = pd.DataFrame() + df_samples["model"] = np.repeat(other_models, n_questions_per_model) + df_samples["question_id"] = np.random.choice( + questions, size=n_questions_per_model * n_other_models, replace=True + ) + + # Calculate number of occurences of question_id for a given model. + # This is for getting a unique primary key for simulated questions. + # This approach treats the same question_id (from the original dataset) + # resampled k times as k different questions (i.e., k different + # sim_question_id's) + df_samples = df_samples.sort_values(["model", "question_id"]).reset_index(drop=True) + df_samples["occ_question_id"] = ( + df_samples.groupby(["model", "question_id"]).cumcount() + 1 ) - # Single merge operation to get all data - # The sample_id ensures we keep duplicates when sampling with replacement + # Create a unique primary key for simulated questions + df_samples["sim_question_id"] = ( + df_samples["question_id"] + "-" + df_samples["occ_question_id"].astype(str) + ) + + # Add reference model + df_temp = df_samples[["question_id", "sim_question_id"]].copy().drop_duplicates() + df_temp["model"] = ref_model + df_samples = pd.concat([df_samples, df_temp], ignore_index=True) + + # Get data on forecasts and realizations from the original dataset df_results = df_samples.merge( df[["model", "question_id", "forecast", "resolved_to", "question_type"]], on=["model", "question_id"], @@ -377,6 +366,8 @@ def simulate_random_sampling(df, n_questions_per_model, ref_model="Always 0.5"): ) # Clean up + df_results["question_id"] = df_results["sim_question_id"] + df_results = df_results.drop(["occ_question_id", "sim_question_id"], axis=1) df_results = df_results.reset_index(drop=True) return df_results @@ -407,11 +398,15 @@ def fixed_dataset_market_question_sample(df, n): # The remainder n_market = n - n_dataset + # Since sampling is done with replacement, we need at least + # 1 market and at least 1 dataset questions, and the required + # number of market questions to sample should be >= 1 if ( n_dataset_horizon < 1 or n_market < 1 - or n_dataset > len(df["question_type"] == "dataset") - or n_market > len(df["question_type"] == "market") + or len(dataset_groups.values[0]) < 1 # At least one dataset question exists + or len(df[df["question_type"] == "market"]) + < 1 # At least one market question exists ): raise ValueError( f"`fixed_dataset_market_question_sample()` needs a bigger `n`. It was " @@ -667,6 +662,28 @@ def simulate_round_based( # Create DataFrame from all samples df_samples = pd.DataFrame(data_rows) + # Create a unique primary key for simulated questions. + # A question with the same question_id (from the original dataset) + # that is drawn in round R and R + 1 is treated as a different question + # (i.e., will have different sim_question_id's). Also, + # if the same question is drawn k > 1 times in the same round R, + # it will also get different sim_question_id's. + df_samples = df_samples.sort_values( + ["model", "round_id", "question_id"], ascending=True + ).reset_index(drop=True) + df_samples["occ_question_id"] = ( + df_samples.groupby(["model", "round_id", "question_id"]).cumcount() + 1 + ) + + # Create unique sim_question_id for within-round duplicates + df_samples["sim_question_id"] = ( + df_samples["question_id"] + + "-R" + + df_samples["round_id"].astype(str) + + "-" + + df_samples["occ_question_id"].astype(str) + ) + # Merge with original data to get forecasts and outcomes # round_id ensures uniqueness during merge df_results = df_samples.merge( @@ -675,6 +692,11 @@ def simulate_round_based( how="left", ) + # Clean up + df_results["question_id"] = df_results["sim_question_id"] + df_results = df_results.drop(["occ_question_id", "sim_question_id"], axis=1) + df_results = df_results.reset_index(drop=True) + return df_results diff --git a/tests/test_ranking_sim.py b/tests/test_ranking_sim.py index e8f8b28..78355c4 100644 --- a/tests/test_ranking_sim.py +++ b/tests/test_ranking_sim.py @@ -513,7 +513,7 @@ def test_fixed_dataset_market_question_sample_success( assert all(f"{source}_{qid}_{h}" in questions for h in dataset_horizons) -@pytest.mark.parametrize("N", [99, 3, 0]) +@pytest.mark.parametrize("N", [3, 0]) def test_fixed_dataset_market_question_sample_errors(df_for_sampling, N): """Test error is thrown for bad values of `N`.""" with pytest.raises(ValueError): @@ -1266,10 +1266,9 @@ def test_simulate_random_sampling(): # Test with 20% overlap df_sim = simulate_random_sampling(df, n_questions_per_model=2, ref_model="A") - # Check that ref model A has all questions + # Check that ref model A answers all questions a_questions = df_sim[df_sim["model"] == "A"]["question_id"].unique() - assert len(a_questions) == 3 # All 3 questions - assert set(a_questions) == {"q1", "q2", "q3"} + assert set(a_questions) == set(df_sim["question_id"].unique()) # Check that other models have fewer questions b_questions = df_sim[df_sim["model"] == "B"]["question_id"].values @@ -1278,22 +1277,19 @@ def test_simulate_random_sampling(): assert len(c_questions) == 2 # Check that all data is preserved correctly + df_sim["orig_question_id"] = ( + df_sim["question_id"].str.rsplit("-", n=1).str[0] + ) # Get the corresponding original question_id for _, row in df_sim.iterrows(): # Find corresponding row in original - mask = (df["model"] == row["model"]) & (df["question_id"] == row["question_id"]) + mask = (df["model"] == row["model"]) & ( + df["question_id"] == row["orig_question_id"] + ) orig_row = df[mask].iloc[0] assert row["forecast"] == orig_row["forecast"] assert row["resolved_to"] == orig_row["resolved_to"] assert row["question_type"] == orig_row["question_type"] - # Test with n_questions_per_model=3 - df_sim_full = simulate_random_sampling(df, n_questions_per_model=3, ref_model="A") - - # Each model should have all 3 questions - for model in ["A", "B", "C"]: - model_samples = len(df_sim_full[df_sim_full["model"] == model]) - assert model_samples == 3 - # Test that ref_model must exist with pytest.raises(ValueError, match="Reference model not provided"): simulate_random_sampling(df, n_questions_per_model=2, ref_model="NonExistent") @@ -1451,7 +1447,7 @@ def test_evaluate_ranking_methods_oracle(): # Oracle should almost always be ranked #1 top1_retention = method_results["Top-1 Retention"].mean() assert ( - top1_retention > 0.90 + top1_retention > 0.85 ), f"Oracle should almost always be top-1 \ with {method}, got {top1_retention}" @@ -2043,7 +2039,7 @@ def test_simulation_regression_results(): - n_questions_per_model: 125 - dataset_weight: 0.5 - simulation_method: "random_sampling" - - ref_model = "GPT-4 (zero shot)" + - ref_model = "Naive Forecaster" """ np.random.seed(20250527) @@ -2077,10 +2073,10 @@ def test_simulation_regression_results(): # Expected results (from known good run) expected_results = { - "Brier": {"Spearman": 0.726376, "Top-20 Retention": 0.503}, - "Diff-Adj. Brier": {"Spearman": 0.813342, "Top-20 Retention": 0.582}, + "Brier": {"Spearman": 0.726328, "Top-20 Retention": 0.503}, + "Diff-Adj. Brier": {"Spearman": 0.812766, "Top-20 Retention": 0.589}, "BSS": {"Spearman": 0.134529, "Top-20 Retention": 0.321}, - "Peer Score": {"Spearman": 0.813029, "Top-20 Retention": 0.590}, + "Peer Score": {"Spearman": 0.811932, "Top-20 Retention": 0.577}, } # Check results @@ -2350,7 +2346,12 @@ def test_difficulty_drift_hard_share_grows(): skill_temperature=None, difficulty_temperature=lambda r: beta[r], ) - hard = (sim["question_id"] == "hard").groupby(sim["round_id"]).any() + hard = ( + sim["question_id"] + .apply(lambda x: x[0:4] == "hard") + .groupby(sim["round_id"]) + .any() + ) hard_counts += hard.reindex(range(R), fill_value=False).astype(int).values hard_share = hard_counts / REP @@ -2414,9 +2415,9 @@ def test_simulation_regression_round_based_results(): # Expected results (from known good run) expected_results = { "Brier": {"Spearman": 0.694564, "Top-20 Retention": 0.476}, - "Diff-Adj. Brier": {"Spearman": 0.771530, "Top-20 Retention": 0.557}, + "Diff-Adj. Brier": {"Spearman": 0.770730, "Top-20 Retention": 0.560}, "BSS": {"Spearman": 0.145300, "Top-20 Retention": 0.319}, - "Peer Score": {"Spearman": 0.770672, "Top-20 Retention": 0.564}, + "Peer Score": {"Spearman": 0.768716, "Top-20 Retention": 0.561}, } # Check results @@ -2469,3 +2470,112 @@ def test_persistence_60_percent(): r1 = set(sim.loc[sim.round_id == 1, "model"]) - {"Always 0.5"} assert len(r0 & r1) == np.floor(0.6 * len(r0)) # 60 % persistence + + +@pytest.mark.parametrize("simulation_type", ["random", "round_based"]) +@pytest.mark.parametrize( + "simulation_kwargs", + [ + {"n_questions_per_model": 10}, # For random sampling only + { + "n_rounds": 3, + "questions_per_round": 5, + "models_per_round_mean": 3, + }, # For round_based only + { + "n_rounds": 2, + "questions_per_round": 8, + "models_per_round_mean": 4, + "model_persistence": 0.5, + }, # Round_based with persistence + { + "n_rounds": 4, + "questions_per_round": 3, + "models_per_round_mean": 2, + "fixed_models_per_round": True, + }, # Round_based fixed models + ], +) +def test_no_duplicates_at_model_question_level(simulation_type, simulation_kwargs): + """Test that there are no duplicate [model, question_id] combinations in + simulation results. + + This ensures that each model answers each question at most once, which is critical + for proper ranking calculations. + """ + # Create test dataset with sufficient questions and models + models = ["RefModel", "ModelA", "ModelB", "ModelC", "ModelD", "ModelE"] + question_ids = [f"q{i}" for i in range(20)] + + data = [] + for model in models: + for question_id in question_ids: + data.append( + { + "model": model, + "question_id": question_id, + "forecast": np.random.uniform(0.1, 0.9), + "resolved_to": np.random.choice([0, 1]), + "question_type": "dataset", + } + ) + + df = pd.DataFrame(data) + + # Set seed for reproducibility + np.random.seed(42) + + # Run appropriate simulation based on type + if simulation_type == "random": + # Only use kwargs relevant to random sampling + relevant_kwargs = { + k: v for k, v in simulation_kwargs.items() if k in ["n_questions_per_model"] + } + if not relevant_kwargs: + pytest.skip("No relevant kwargs for random sampling") + + df_sim = simulate_random_sampling(df, ref_model="RefModel", **relevant_kwargs) + + elif simulation_type == "round_based": + # Only use kwargs relevant to round-based sampling + relevant_kwargs = { + k: v + for k, v in simulation_kwargs.items() + if k + in [ + "n_rounds", + "questions_per_round", + "models_per_round_mean", + "model_persistence", + "fixed_models_per_round", + ] + } + if "n_rounds" not in relevant_kwargs: + pytest.skip("No relevant kwargs for round-based sampling") + + df_sim = simulate_round_based(df, ref_model="RefModel", **relevant_kwargs) + + # Check for duplicates at [model, question_id] level + duplicate_check = df_sim.groupby(["model", "question_id"]).size() + duplicates = duplicate_check[duplicate_check > 1] + + if len(duplicates) > 0: + print(f"\nFound duplicates in {simulation_type} simulation:") + print(f"Simulation kwargs: {simulation_kwargs}") + print("Duplicate combinations:") + for (model, question_id), count in duplicates.items(): + print(f" Model '{model}', Question '{question_id}': {count} occurrences") + + # Also show a sample of the problematic data + sample_duplicate = duplicates.index[0] + model, question_id = sample_duplicate + duplicate_rows = df_sim[ + (df_sim["model"] == model) & (df_sim["question_id"] == question_id) + ] + print(f"\nSample duplicate rows for Model '{model}', Question '{question_id}':") + print(duplicate_rows.to_string()) + + # The assertion: no duplicates should exist + assert ( + len(duplicates) == 0 + ), f"Found {len(duplicates)} duplicate [model, question_id] combinations"