Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 67 additions & 45 deletions src/ranking_sim.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,63 +320,54 @@ def simulate_random_sampling(df, n_questions_per_model, ref_model="Always 0.5"):
"""Simulate a dataset by drawing a n_questions_per_model random questions
(sampled with replacement) from the full sample of questions. ref_model
answers all questions"""
# Get parameters
models = df["model"].unique()
# Extract variables
questions = df["question_id"].unique()
n_models = len(models)
n_questions = len(questions)
models = df["model"].unique()

# Check if ref_model exists
if ref_model is not None and ref_model in models:
# Find ref_model index
ref_model_idx = np.where(models == ref_model)[0][0]

# Create indices for all OTHER models
other_model_indices = np.arange(n_models)
other_model_indices = other_model_indices[other_model_indices != ref_model_idx]

# Sample for other models
all_model_indices = np.repeat(other_model_indices, n_questions_per_model)
all_question_indices = np.concatenate(
[
np.random.choice(n_questions, size=n_questions_per_model, replace=True)
for _ in range(len(other_model_indices))
]
)

# Append reference model with ALL questions at the end
all_model_indices = np.concatenate(
[all_model_indices, np.repeat(ref_model_idx, n_questions)]
)
all_question_indices = np.concatenate(
[all_question_indices, np.arange(n_questions)]
)
else:
if ref_model is None or ref_model not in models:
raise ValueError("Reference model not provided.")

# Convert indices to actual model/question values
sampled_models = models[all_model_indices]
sampled_questions = questions[all_question_indices]

# Create DataFrame of sampled (model, question) pairs
# Include a sample_id to handle duplicates from sampling with replacement
df_samples = pd.DataFrame(
{
"model": sampled_models,
"question_id": sampled_questions,
"sample_id": np.arange(len(sampled_models)), # Unique ID for each sample
}
other_models = [model for model in models if model != ref_model]
n_other_models = len(other_models)

# Draw questions for non-reference models
df_samples = pd.DataFrame()
df_samples["model"] = np.repeat(other_models, n_questions_per_model)
df_samples["question_id"] = np.random.choice(
questions, size=n_questions_per_model * n_other_models, replace=True
)

# Calculate number of occurrences of question_id for a given model.
# This is for getting a unique primary key for simulated questions.
# This approach treats the same question_id (from the original dataset)
# resampled k times as k different questions (i.e., k different
# sim_question_id's)
df_samples = df_samples.sort_values(["model", "question_id"]).reset_index(drop=True)
df_samples["occ_question_id"] = (
df_samples.groupby(["model", "question_id"]).cumcount() + 1
)

# Single merge operation to get all data
# The sample_id ensures we keep duplicates when sampling with replacement
# Create a unique primary key for simulated questions
df_samples["sim_question_id"] = (
df_samples["question_id"] + "-" + df_samples["occ_question_id"].astype(str)
)

# Add reference model
df_temp = df_samples[["question_id", "sim_question_id"]].copy().drop_duplicates()
df_temp["model"] = ref_model
df_samples = pd.concat([df_samples, df_temp], ignore_index=True)

# Get data on forecasts and realizations from the original dataset
df_results = df_samples.merge(
df[["model", "question_id", "forecast", "resolved_to", "question_type"]],
on=["model", "question_id"],
how="left",
)

# Clean up
df_results["question_id"] = df_results["sim_question_id"]
df_results = df_results.drop(["occ_question_id", "sim_question_id"], axis=1)
df_results = df_results.reset_index(drop=True)

return df_results
Expand Down Expand Up @@ -407,11 +398,15 @@ def fixed_dataset_market_question_sample(df, n):
# The remainder
n_market = n - n_dataset

# Since sampling is done with replacement, we need at least
# 1 market and at least 1 dataset questions, and the required
# number of market questions to sample should be >= 1
if (
n_dataset_horizon < 1
or n_market < 1
or n_dataset > len(df["question_type"] == "dataset")
or n_market > len(df["question_type"] == "market")
or len(dataset_groups.values[0]) < 1 # At least one dataset question exists
or len(df[df["question_type"] == "market"])
< 1 # At least one market question exists
):
raise ValueError(
f"`fixed_dataset_market_question_sample()` needs a bigger `n`. It was "
Expand Down Expand Up @@ -667,6 +662,28 @@ def simulate_round_based(
# Create DataFrame from all samples
df_samples = pd.DataFrame(data_rows)

# Create a unique primary key for simulated questions.
# A question with the same question_id (from the original dataset)
# that is drawn in round R and R + 1 is treated as a different question
# (i.e., will have different sim_question_id's). Also,
# if the same question is drawn k > 1 times in the same round R,
# it will also get different sim_question_id's.
df_samples = df_samples.sort_values(
["model", "round_id", "question_id"], ascending=True
).reset_index(drop=True)
df_samples["occ_question_id"] = (
df_samples.groupby(["model", "round_id", "question_id"]).cumcount() + 1
)

# Create unique sim_question_id for within-round duplicates
df_samples["sim_question_id"] = (
df_samples["question_id"]
+ "-R"
+ df_samples["round_id"].astype(str)
+ "-"
+ df_samples["occ_question_id"].astype(str)
)

# Merge with original data to get forecasts and outcomes
# round_id ensures uniqueness during merge
df_results = df_samples.merge(
Expand All @@ -675,6 +692,11 @@ def simulate_round_based(
how="left",
)

# Clean up
df_results["question_id"] = df_results["sim_question_id"]
df_results = df_results.drop(["occ_question_id", "sim_question_id"], axis=1)
df_results = df_results.reset_index(drop=True)

return df_results


Expand Down
152 changes: 131 additions & 21 deletions tests/test_ranking_sim.py
Original file line number Diff line number Diff line change
Expand Up @@ -513,7 +513,7 @@ def test_fixed_dataset_market_question_sample_success(
assert all(f"{source}_{qid}_{h}" in questions for h in dataset_horizons)


@pytest.mark.parametrize("N", [99, 3, 0])
@pytest.mark.parametrize("N", [3, 0])
def test_fixed_dataset_market_question_sample_errors(df_for_sampling, N):
"""Test error is thrown for bad values of `N`."""
with pytest.raises(ValueError):
Expand Down Expand Up @@ -1266,10 +1266,9 @@ def test_simulate_random_sampling():
# Test with 20% overlap
df_sim = simulate_random_sampling(df, n_questions_per_model=2, ref_model="A")

# Check that ref model A has all questions
# Check that ref model A answers all questions
a_questions = df_sim[df_sim["model"] == "A"]["question_id"].unique()
assert len(a_questions) == 3 # All 3 questions
assert set(a_questions) == {"q1", "q2", "q3"}
assert set(a_questions) == set(df_sim["question_id"].unique())

# Check that other models have fewer questions
b_questions = df_sim[df_sim["model"] == "B"]["question_id"].values
Expand All @@ -1278,22 +1277,19 @@ def test_simulate_random_sampling():
assert len(c_questions) == 2

# Check that all data is preserved correctly
df_sim["orig_question_id"] = (
df_sim["question_id"].str.rsplit("-", n=1).str[0]
) # Get the corresponding original question_id
for _, row in df_sim.iterrows():
# Find corresponding row in original
mask = (df["model"] == row["model"]) & (df["question_id"] == row["question_id"])
mask = (df["model"] == row["model"]) & (
df["question_id"] == row["orig_question_id"]
)
orig_row = df[mask].iloc[0]
assert row["forecast"] == orig_row["forecast"]
assert row["resolved_to"] == orig_row["resolved_to"]
assert row["question_type"] == orig_row["question_type"]

# Test with n_questions_per_model=3
df_sim_full = simulate_random_sampling(df, n_questions_per_model=3, ref_model="A")

# Each model should have all 3 questions
for model in ["A", "B", "C"]:
model_samples = len(df_sim_full[df_sim_full["model"] == model])
assert model_samples == 3

# Test that ref_model must exist
with pytest.raises(ValueError, match="Reference model not provided"):
simulate_random_sampling(df, n_questions_per_model=2, ref_model="NonExistent")
Expand Down Expand Up @@ -1451,7 +1447,7 @@ def test_evaluate_ranking_methods_oracle():
# Oracle should almost always be ranked #1
top1_retention = method_results["Top-1 Retention"].mean()
assert (
top1_retention > 0.90
top1_retention > 0.85
), f"Oracle should almost always be top-1 \
with {method}, got {top1_retention}"

Expand Down Expand Up @@ -2043,7 +2039,7 @@ def test_simulation_regression_results():
- n_questions_per_model: 125
- dataset_weight: 0.5
- simulation_method: "random_sampling"
- ref_model = "GPT-4 (zero shot)"
- ref_model = "Naive Forecaster"
"""
np.random.seed(20250527)

Expand Down Expand Up @@ -2077,10 +2073,10 @@ def test_simulation_regression_results():

# Expected results (from known good run)
expected_results = {
"Brier": {"Spearman": 0.726376, "Top-20 Retention": 0.503},
"Diff-Adj. Brier": {"Spearman": 0.813342, "Top-20 Retention": 0.582},
"Brier": {"Spearman": 0.726328, "Top-20 Retention": 0.503},
"Diff-Adj. Brier": {"Spearman": 0.812766, "Top-20 Retention": 0.589},
"BSS": {"Spearman": 0.134529, "Top-20 Retention": 0.321},
"Peer Score": {"Spearman": 0.813029, "Top-20 Retention": 0.590},
"Peer Score": {"Spearman": 0.811932, "Top-20 Retention": 0.577},
}

# Check results
Expand Down Expand Up @@ -2350,7 +2346,12 @@ def test_difficulty_drift_hard_share_grows():
skill_temperature=None,
difficulty_temperature=lambda r: beta[r],
)
hard = (sim["question_id"] == "hard").groupby(sim["round_id"]).any()
hard = (
sim["question_id"]
.apply(lambda x: x[0:4] == "hard")
.groupby(sim["round_id"])
.any()
)
hard_counts += hard.reindex(range(R), fill_value=False).astype(int).values
hard_share = hard_counts / REP

Expand Down Expand Up @@ -2414,9 +2415,9 @@ def test_simulation_regression_round_based_results():
# Expected results (from known good run)
expected_results = {
"Brier": {"Spearman": 0.694564, "Top-20 Retention": 0.476},
"Diff-Adj. Brier": {"Spearman": 0.771530, "Top-20 Retention": 0.557},
"Diff-Adj. Brier": {"Spearman": 0.770730, "Top-20 Retention": 0.560},
"BSS": {"Spearman": 0.145300, "Top-20 Retention": 0.319},
"Peer Score": {"Spearman": 0.770672, "Top-20 Retention": 0.564},
"Peer Score": {"Spearman": 0.768716, "Top-20 Retention": 0.561},
}

# Check results
Expand Down Expand Up @@ -2469,3 +2470,112 @@ def test_persistence_60_percent():
r1 = set(sim.loc[sim.round_id == 1, "model"]) - {"Always 0.5"}

assert len(r0 & r1) == np.floor(0.6 * len(r0)) # 60 % persistence


@pytest.mark.parametrize("simulation_type", ["random", "round_based"])
@pytest.mark.parametrize(
    "simulation_kwargs",
    [
        {"n_questions_per_model": 10},  # For random sampling only
        {
            "n_rounds": 3,
            "questions_per_round": 5,
            "models_per_round_mean": 3,
        },  # For round_based only
        {
            "n_rounds": 2,
            "questions_per_round": 8,
            "models_per_round_mean": 4,
            "model_persistence": 0.5,
        },  # Round_based with persistence
        {
            "n_rounds": 4,
            "questions_per_round": 3,
            "models_per_round_mean": 2,
            "fixed_models_per_round": True,
        },  # Round_based fixed models
    ],
)
def test_no_duplicates_at_model_question_level(simulation_type, simulation_kwargs):
    """Test that there are no duplicate [model, question_id] combinations in
    simulation results.

    This ensures that each model answers each question at most once, which is
    critical for proper ranking calculations.

    The two ``parametrize`` decorators form a cross product, so every kwargs
    set is paired with both simulation types. Pairings whose kwargs do not
    belong to the simulation type under test are skipped.
    """
    # Seed BEFORE any random draw so that both the synthetic dataset and the
    # simulation itself are reproducible, independent of the RNG state left
    # behind by previously executed tests.
    np.random.seed(42)

    # Create test dataset with sufficient questions and models
    models = ["RefModel", "ModelA", "ModelB", "ModelC", "ModelD", "ModelE"]
    question_ids = [f"q{i}" for i in range(20)]

    df = pd.DataFrame(
        [
            {
                "model": model,
                "question_id": question_id,
                "forecast": np.random.uniform(0.1, 0.9),
                "resolved_to": np.random.choice([0, 1]),
                "question_type": "dataset",
            }
            for model in models
            for question_id in question_ids
        ]
    )

    # Keyword arguments understood by each simulation entry point
    random_keys = ("n_questions_per_model",)
    round_based_keys = (
        "n_rounds",
        "questions_per_round",
        "models_per_round_mean",
        "model_persistence",
        "fixed_models_per_round",
    )

    # Run appropriate simulation based on type
    if simulation_type == "random":
        # Only use kwargs relevant to random sampling
        relevant_kwargs = {
            k: v for k, v in simulation_kwargs.items() if k in random_keys
        }
        if not relevant_kwargs:
            pytest.skip("No relevant kwargs for random sampling")

        df_sim = simulate_random_sampling(df, ref_model="RefModel", **relevant_kwargs)

    elif simulation_type == "round_based":
        # Only use kwargs relevant to round-based sampling
        relevant_kwargs = {
            k: v for k, v in simulation_kwargs.items() if k in round_based_keys
        }
        if "n_rounds" not in relevant_kwargs:
            pytest.skip("No relevant kwargs for round-based sampling")

        df_sim = simulate_round_based(df, ref_model="RefModel", **relevant_kwargs)

    else:
        # Fail loudly instead of hitting a NameError on `df_sim` below
        raise ValueError(f"Unknown simulation_type: {simulation_type!r}")

    # Check for duplicates at [model, question_id] level
    duplicate_check = df_sim.groupby(["model", "question_id"]).size()
    duplicates = duplicate_check[duplicate_check > 1]

    if len(duplicates) > 0:
        # Diagnostics only; pytest shows captured stdout when the test fails
        print(f"\nFound duplicates in {simulation_type} simulation:")
        print(f"Simulation kwargs: {simulation_kwargs}")
        print("Duplicate combinations:")
        for (model, question_id), count in duplicates.items():
            print(f"  Model '{model}', Question '{question_id}': {count} occurrences")

        # Also show a sample of the problematic data
        model, question_id = duplicates.index[0]
        duplicate_rows = df_sim[
            (df_sim["model"] == model) & (df_sim["question_id"] == question_id)
        ]
        print(f"\nSample duplicate rows for Model '{model}', Question '{question_id}':")
        print(duplicate_rows.to_string())

    # The assertion: no duplicates should exist
    assert (
        len(duplicates) == 0
    ), f"Found {len(duplicates)} duplicate [model, question_id] combinations"