Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 38 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ OHLCV data is hosted on Google Drive. Use the `portbench` CLI to download it:
```bash
pip install gdown
portbench download-data --exchange portfoliobench # crypto + US stocks + global indices
portbench download-data --exchange polymarket # Polymarket prediction-market contracts
portbench download-data --exchange polymarket # Polymarket prediction-market contracts, orderbooks and markets metadata
```

This downloads feather files into the data directory (`user_data/data/portfoliobench/` or `user_data/data/polymarket/`).
Expand Down Expand Up @@ -264,6 +264,42 @@ PortfolioBench supports backtesting on **Polymarket**, a decentralized predictio
bash utils/backtest_polymarket.bash
```

### Orderbook Dataset Pipeline

To build a feature-enriched orderbook dataset from raw Polymarket data, use the pipeline script in `dataset/polymarket_orderbook/`:

```bash
# Run the full pipeline with defaults (2025-10-14 → 2026-03-31, top 70 markets)
python dataset/polymarket_orderbook/run_pipeline.py
```

The pipeline runs three stages in sequence:

| Stage | Script | Output |
|-------|--------|--------|
| 1. Parse | `parser.py` | `markets.parquet`, `tokens.parquet`, `filtered_token_ids.parquet` |
| 2. Fetch | `fetch_orderbook.py` | `raw_orderbook/ob_<token_id>.parquet` per token |
| 3. Features | `orderbook_feature_generation.py` | `feat_orderbook/feat_<token_id>.parquet` per token |

The pipeline is resumable — tokens that already have a saved `raw_orderbook/ob_<token_id>.parquet` file are skipped automatically (pass `--force` to re-fetch them).

**Common options:**

```bash
# Custom date range
python dataset/polymarket_orderbook/run_pipeline.py --start-date 2025-01-01 --end-date 2025-06-30

# Skip stages already completed
python dataset/polymarket_orderbook/run_pipeline.py --skip-parse
python dataset/polymarket_orderbook/run_pipeline.py --skip-parse --skip-fetch

# Re-fetch all orderbooks even if already saved
python dataset/polymarket_orderbook/run_pipeline.py --skip-parse --force

# Keep only the top N markets by volume, restricted to markets ending after a given date
python dataset/polymarket_orderbook/run_pipeline.py --top-markets 100 --min-end-date 2025-06-01
```

---

## Hyperparameter Optimization
Expand Down Expand Up @@ -330,7 +366,7 @@ PortfolioBench/
├── benchmark_all.py # Full benchmark matrix runner
├── cli.py # CLI entry point
├── generate_report.py # Report generation utilities
├── dataset/ # Data management module (placeholder)
├── dataset/polymarket_orderbook/ # Orderbook dataset pipeline (parse → fetch → features)
├── tests/ # Unit and integration tests
├── user_data/data/usstock/ # 357 OHLCV feather files (download from Google Drive)
└── utils/ # Bash helpers for backtesting and data generation
Expand Down
103 changes: 103 additions & 0 deletions alpha/OrderbookAlpha.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
"""Orderbook-derived alpha for Polymarket contracts.

Loads a pre-computed feature parquet (from run_pipeline.py), attaches to each
OHLCV candle the most recent orderbook snapshot taken at or before the candle's
timestamp (a backward as-of join), and adds two columns:

ob_imbalance : raw top-3-level orderbook imbalance [-1, 1]
ob_imbalance_ema: EMA-smoothed version (span=ema_span candles)

Token-id is resolved automatically from the pair name using:
freqtrade_pair_mapping.csv → condition_id
markets.parquet → market_id
tokens.parquet → token_id
"""

import re
from pathlib import Path

import numpy as np
import pandas as pd

from alpha.interface import IAlpha

# Polymarket data directory, resolved relative to this file:
# <parent of this file's directory>/user_data/data/polymarket
_DATA_DIR = Path(__file__).resolve().parents[1] / "user_data" / "data" / "polymarket"
# Directory holding the per-token feature files (feat_<token_id>.parquet) read by OrderbookAlpha.
_FEATURE_DIR = _DATA_DIR / "feat_orderbook"

# Cached once per process: pair_base → token_id
# None means the map has not been built yet; _lookup_token_id() builds it on first use.
_pair_token_map: dict[str, str] | None = None


def _build_pair_token_map() -> dict[str, str]:
    """Build the ``pair_base → token_id`` lookup from the Polymarket metadata files.

    Three files under ``_DATA_DIR`` are joined:

    * ``freqtrade_pair_mapping.csv`` (``Original_Condition_ID``, ``New_Filename``)
    * ``markets.parquet`` (``condition_id`` → ``market_id``)
    * ``tokens.parquet`` (``market_id`` + ``outcome`` → ``token_id``)

    The pair base is the feather filename with the ``.feather`` extension, the
    trailing ``-<n>[mhd]`` timeframe and the final ``_<quote>`` part removed,
    e.g. ``"SomePairYES20250430_USDC-4h.feather"`` → ``"SomePairYES20250430"``.

    Returns:
        Mapping of pair base to token id (as strings). Rows whose condition id,
        outcome or token cannot be resolved are silently left out. When several
        rows share a pair base, the last resolvable row wins.
    """
    mapping = pd.read_csv(_DATA_DIR / "freqtrade_pair_mapping.csv")
    markets = pd.read_parquet(_DATA_DIR / "markets.parquet", columns=["market_id", "condition_id"])
    tokens = pd.read_parquet(_DATA_DIR / "tokens.parquet", columns=["token_id", "market_id", "outcome"])

    # Condition ids are compared case-insensitively.
    cond_to_market = dict(zip(markets["condition_id"].str.lower(), markets["market_id"].astype(str)))
    # (market_id, normalised outcome) → token_id. Rows with a missing outcome
    # cannot be matched to a YES/NO pair, so they are skipped instead of crashing.
    tok_idx = {
        (str(r.market_id), r.outcome.strip().lower()): str(r.token_id)
        for r in tokens.itertuples(index=False)
        if isinstance(r.outcome, str)
    }

    # Compiled once; both patterns are applied to every mapping row.
    timeframe_suffix = re.compile(r"-\d+[mhd]$")
    # Outcome marker at the end of the pair base, optionally followed by digits
    # (e.g. the date in "SomePairYES20250430").
    trailing_outcome = re.compile(r"(YES|NO)\d*$")

    result: dict[str, str] = {}
    for raw_cond_id, raw_filename in zip(mapping["Original_Condition_ID"], mapping["New_Filename"]):
        market_id = cond_to_market.get(str(raw_cond_id).lower())
        if not market_id:
            continue

        # "SomePairYES20250430_USDC-4h.feather" → pair_base = "SomePairYES20250430"
        stem = timeframe_suffix.sub("", str(raw_filename).removesuffix(".feather"))
        if "_" not in stem:
            continue
        pair_base = stem.rsplit("_", 1)[0]

        # Prefer the trailing marker: a plain substring test misreads slugs that
        # merely contain "yes"/"no" inside a word (e.g. "Yesterday...NO2025...").
        # The substring test is kept as a fallback for names without that layout.
        upper_base = pair_base.upper()
        marker = trailing_outcome.search(upper_base)
        if marker:
            outcome_key = marker.group(1).lower()
        elif "YES" in upper_base:
            outcome_key = "yes"
        elif "NO" in upper_base:
            outcome_key = "no"
        else:
            continue

        token_id = tok_idx.get((market_id, outcome_key))
        if token_id:
            result[pair_base] = token_id

    return result


def _lookup_token_id(pair: str) -> str:
    """Return the token id for a trading pair, or ``""`` when it is unknown.

    Only the part of ``pair`` before the first ``"/"`` is used as the lookup key.
    The pair → token map is built on the first call and reused afterwards via
    the module-level ``_pair_token_map`` cache.
    """
    global _pair_token_map
    token_map = _pair_token_map
    if token_map is None:
        # First use in this process: build the map and remember it.
        token_map = _pair_token_map = _build_pair_token_map()
    pair_base, _, _ = pair.partition("/")
    return token_map.get(pair_base, "")


class OrderbookAlpha(IAlpha):
    """Alpha that adds orderbook-imbalance columns to an OHLCV dataframe.

    The token id comes from ``metadata["token_id"]`` when present, otherwise it
    is looked up from ``metadata["pair"]``. Features are read from
    ``feat_<token_id>.parquet`` in the feature directory.
    """

    def __init__(self, dataframe: pd.DataFrame, metadata: dict | None = None, ema_span: int = 8):
        """Store the EMA span and hand the dataframe/metadata to ``IAlpha``.

        Args:
            dataframe: Candle dataframe; ``process`` reads its ``date`` column.
            metadata: Optional dict; the keys ``token_id`` and ``pair`` are used.
            ema_span: Span (in candles) of the EMA applied to the imbalance.
        """
        # NOTE(review): set before super().__init__ as in the original, in case
        # the base initialiser already calls process() — confirm against IAlpha.
        self.ema_span = ema_span
        super().__init__(dataframe, metadata)

    def process(self) -> pd.DataFrame:
        """Add ``ob_imbalance`` and ``ob_imbalance_ema`` to the dataframe in place.

        Returns:
            The same dataframe object. When no token id can be resolved or the
            feature file does not exist, both columns are all-NaN. Otherwise
            each candle gets the ``imbalance_3`` value of the latest snapshot at
            or before its date; candles with no such snapshot get 0.0.
        """
        df = self.dataframe
        # metadata defaults to None in __init__; treat that as "no metadata"
        # rather than failing on ``None.get``.
        metadata = self.metadata or {}
        token_id = metadata.get("token_id") or _lookup_token_id(metadata.get("pair", ""))

        feat_path = _FEATURE_DIR / f"feat_{token_id}.parquet"
        if not token_id or not feat_path.exists():
            df["ob_imbalance"] = np.nan
            df["ob_imbalance_ema"] = np.nan
            return df

        feat = pd.read_parquet(feat_path, columns=["snapshot_time", "imbalance_3"])
        # Timezone info is dropped on both sides so the keys compare as naive
        # timestamps. NOTE(review): this assumes snapshots and candles share a
        # timezone (presumably UTC) — confirm against the pipeline output.
        # Both keys are cast to the same resolution because merge_asof needs
        # identically typed keys, and parquet timestamps may not be nanoseconds.
        feat["snapshot_time"] = (
            pd.to_datetime(feat["snapshot_time"]).dt.tz_localize(None).astype("datetime64[ns]")
        )
        feat = feat.sort_values("snapshot_time").reset_index(drop=True)

        candle_dates = pd.to_datetime(df["date"]).dt.tz_localize(None).astype("datetime64[ns]")
        # merge_asof needs the left key sorted; ``_idx`` remembers each candle's
        # original position so the result can be put back in dataframe order.
        order = np.argsort(candle_dates.values)
        left = pd.DataFrame({"date": candle_dates.iloc[order].values, "_idx": order})
        merged = (
            pd.merge_asof(left, feat.rename(columns={"snapshot_time": "date"}), on="date", direction="backward")
            .sort_values("_idx")
            .reset_index(drop=True)
        )

        # Candles without an earlier snapshot have no match; use 0.0 for them.
        imb = merged["imbalance_3"].fillna(0.0)
        # Assign positionally (``.values``) so the dataframe's own index is irrelevant.
        df["ob_imbalance"] = imb.values
        df["ob_imbalance_ema"] = imb.ewm(span=self.ema_span, adjust=False).mean().values
        return df
Empty file.
75 changes: 75 additions & 0 deletions dataset/polymarket_orderbook/fetch_orderbook.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import argparse
import sys
from pathlib import Path

import pandas as pd

_PROJECT_ROOT = Path(__file__).resolve().parents[2]
if str(_PROJECT_ROOT) not in sys.path:
sys.path.insert(0, str(_PROJECT_ROOT))

from dataset.polymarket_orderbook.utils.paths import FILTERED_TOKEN_IDS, ORDERBOOK_DIR, ensure_dirs
from dataset.polymarket_orderbook.utils.fetch_orderbook import fetch_orderbook_from_ids_async


def parse_args():
    """Parse the command-line options of the orderbook fetch stage.

    Returns:
        argparse.Namespace with ``filtered_tokens_path``, ``raw_orderbook_dir``,
        ``start_date``, ``end_date``, ``max_concurrent``, ``batch_size`` and
        ``force``.
    """
    cli = argparse.ArgumentParser(
        description="Fetch Polymarket orderbook snapshots from DomeAPI for filtered token IDs."
    )

    # Plain value options without help text: (flag, converter, default).
    simple_options = (
        ("--filtered-tokens-path", Path, FILTERED_TOKEN_IDS),
        ("--raw-orderbook-dir", Path, ORDERBOOK_DIR),
        ("--start-date", str, "2025-10-14"),
        ("--end-date", str, "2026-03-31"),
        ("--max-concurrent", int, 3),
    )
    for flag, converter, default in simple_options:
        cli.add_argument(flag, type=converter, default=default)

    cli.add_argument(
        "--batch-size",
        type=int,
        default=10,
        help="Number of tokens to process per batch before freeing memory (default: 10).",
    )
    cli.add_argument(
        "--force",
        action="store_true",
        help="Re-fetch tokens that already have a saved parquet file.",
    )
    return cli.parse_args()


def main():
    """Fetch orderbook snapshots for the filtered token IDs.

    Reads the token list from ``--filtered-tokens-path``, drops tokens that
    already have an ``ob_<token_id>.parquet`` file in ``--raw-orderbook-dir``
    (unless ``--force`` is given) and passes the remaining IDs to
    ``fetch_orderbook_from_ids_async``.
    """
    args = parse_args()
    ensure_dirs()
    args.raw_orderbook_dir.mkdir(parents=True, exist_ok=True)

    # Load token IDs from the filtered parquet produced by parser.py
    token_ids_df = pd.read_parquet(args.filtered_tokens_path)
    all_token_ids = token_ids_df["token_id"].astype(str).tolist()
    print(f"Loaded {len(all_token_ids)} token IDs from {args.filtered_tokens_path}")

    # Resume: skip tokens whose output file already exists
    if args.force:
        token_ids = all_token_ids
    else:
        already_done = {
            p.stem[len("ob_"):] for p in args.raw_orderbook_dir.glob("ob_*.parquet")
        }
        token_ids = [t for t in all_token_ids if t not in already_done]
        # Count only tokens from the current list: the directory may also hold
        # files for tokens that are no longer (or never were) in the list.
        skipped = len(all_token_ids) - len(token_ids)
        if skipped:
            print(
                f"Skipping {skipped} already-fetched token(s). "
                f"{len(token_ids)} remaining. Use --force to re-fetch all."
            )

    if not token_ids:
        print("Nothing to fetch.")
        return

    print(f"Fetching orderbook for {len(token_ids)} token(s) ({args.start_date} → {args.end_date})")

    # NOTE(review): called like a regular function; presumably the helper runs
    # its own event loop internally — confirm in utils/fetch_orderbook.py.
    fetch_orderbook_from_ids_async(
        token_ids=token_ids,
        start_date=args.start_date,
        end_date=args.end_date,
        output_path=args.raw_orderbook_dir,
        max_concurrent=args.max_concurrent,
        batch_size=args.batch_size,
    )


if __name__ == "__main__":
main()
87 changes: 87 additions & 0 deletions dataset/polymarket_orderbook/orderbook_feature_generation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import argparse
import sys
from pathlib import Path

import pandas as pd

_PROJECT_ROOT = Path(__file__).resolve().parents[2]
if str(_PROJECT_ROOT) not in sys.path:
sys.path.insert(0, str(_PROJECT_ROOT))

from dataset.polymarket_orderbook.utils.paths import (
DATA_DIR,
FILTERED_TOKEN_IDS,
ORDERBOOK_DIR,
FEATURE_DIR,
ensure_dirs,
)
from dataset.polymarket_orderbook.utils.orderbook_feature_generation import build_token_feature_table_from_parquet


def parse_args():
    """Parse the command-line options of the feature-generation stage.

    ``--drop-json-cols`` is on by default and can be turned off with
    ``--no-drop-json-cols``; its value is forwarded unchanged to the feature
    builder as ``drop_json_cols``.

    Returns:
        argparse.Namespace with ``tokens_path``, ``filtered_tokens_path``,
        ``raw_orderbook_dir``, ``feat_orderbook_dir``, ``depth_n``,
        ``drop_json_cols`` and ``process_all_files``.
    """
    parser = argparse.ArgumentParser(description="Build per-token orderbook feature parquet files.")
    parser.add_argument("--tokens-path", type=Path, default=DATA_DIR / "tokens.parquet")
    parser.add_argument("--filtered-tokens-path", type=Path, default=FILTERED_TOKEN_IDS)
    parser.add_argument("--raw-orderbook-dir", type=Path, default=ORDERBOOK_DIR)
    parser.add_argument("--feat-orderbook-dir", type=Path, default=FEATURE_DIR)
    parser.add_argument("--depth-n", type=int, default=3)
    # A store_true flag that defaults to True can never be switched off.
    # BooleanOptionalAction keeps "--drop-json-cols" and the True default while
    # adding the "--no-drop-json-cols" form.
    parser.add_argument("--drop-json-cols", action=argparse.BooleanOptionalAction, default=True)
    parser.add_argument(
        "--process-all-files",
        action="store_true",
        help="Ignore filtered token list and process all ob_*.parquet files.",
    )
    return parser.parse_args()


def main():
    """Build one feature parquet per raw orderbook file.

    For every ``ob_<token_id>.parquet`` in ``--raw-orderbook-dir`` the feature
    builder is called and asked to write ``feat_<token_id>.parquet`` into
    ``--feat-orderbook-dir``. By default only tokens listed in
    ``--filtered-tokens-path`` are processed and the token metadata is limited
    to that list. With ``--process-all-files`` the list is not read at all:
    every file is processed and the full token metadata is passed on.

    A failure on one file is reported and counted; the remaining files are
    still processed.
    """
    args = parse_args()
    ensure_dirs()
    args.feat_orderbook_dir.mkdir(parents=True, exist_ok=True)

    tokens_df = pd.read_parquet(args.tokens_path)

    if args.process_all_files:
        # The filtered list is ignored entirely in this mode, so it need not
        # exist and must not strip metadata for the extra tokens.
        relevant_token_ids = None
        print(f"Token metadata rows (unfiltered): {len(tokens_df):,}")
    else:
        # Load filtered token IDs
        relevant_token_ids = set(pd.read_parquet(args.filtered_tokens_path)["token_id"].astype(str))
        print(f"Loaded {len(relevant_token_ids)} filtered token IDs")

        # Restrict token metadata to the filtered set
        tokens_df = tokens_df[tokens_df["token_id"].astype(str).isin(relevant_token_ids)]
        print(f"Token metadata rows after filter: {len(tokens_df):,}")

    files = sorted(args.raw_orderbook_dir.glob("ob_*.parquet"))
    if not files:
        print(f"No ob_*.parquet files found in {args.raw_orderbook_dir}")
        return

    if relevant_token_ids is not None:
        files = [p for p in files if p.stem[len("ob_"):] in relevant_token_ids]

    print(f"Files to process: {len(files)}")

    success = 0
    failed = 0

    for i, file_path in enumerate(files, start=1):
        token_stub = file_path.stem[len("ob_"):]
        output_path = args.feat_orderbook_dir / f"feat_{token_stub}.parquet"

        print(f"[{i}/{len(files)}] {file_path.name}")
        try:
            feat_df = build_token_feature_table_from_parquet(
                input_path=file_path,
                output_path=output_path,
                depth_n=args.depth_n,
                drop_json_cols=args.drop_json_cols,
                token_meta_df=tokens_df,
            )
            print(f" Saved {len(feat_df):,} rows -> {output_path.name}")
            success += 1
        except Exception as e:
            # Deliberate best-effort: one bad file must not stop the batch.
            # The exception type is shown because some messages are empty.
            print(f" Failed: {type(e).__name__}: {e}")
            failed += 1

    print(f"Done. Success: {success}, Failed: {failed}")


if __name__ == "__main__":
main()
65 changes: 65 additions & 0 deletions dataset/polymarket_orderbook/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import argparse
import sys
from pathlib import Path

import pandas as pd

# Ensure the project root is on sys.path so the dataset.polymarket_orderbook.utils imports below work
_PROJECT_ROOT = Path(__file__).resolve().parents[2]
if str(_PROJECT_ROOT) not in sys.path:
sys.path.insert(0, str(_PROJECT_ROOT))

from dataset.polymarket_orderbook.utils.paths import DATA_DIR, MARKET_JSONL, FILTERED_TOKEN_IDS, ensure_dirs
from dataset.polymarket_orderbook.utils.parser import read_market_to_df, read_token_to_df


def parse_args():
    """Parse the command-line options of the market/token parsing stage.

    Returns:
        argparse.Namespace with ``market_jsonl``, ``markets_path``,
        ``tokens_path``, ``filtered_tokens_path``, ``min_end_date`` and
        ``top_markets``.
    """
    cli = argparse.ArgumentParser(
        description="Parse polymarket JSONL into markets, tokens, and filtered token parquet files."
    )

    # Every option is a plain value option: (flag, converter, default).
    option_table = (
        ("--market-jsonl", Path, MARKET_JSONL),
        ("--markets-path", Path, DATA_DIR / "markets.parquet"),
        ("--tokens-path", Path, DATA_DIR / "tokens.parquet"),
        ("--filtered-tokens-path", Path, FILTERED_TOKEN_IDS),
        ("--min-end-date", str, "2026-01-01"),
        ("--top-markets", int, 70),
    )
    for flag, converter, default in option_table:
        cli.add_argument(flag, type=converter, default=default)

    return cli.parse_args()


def main():
    """Parse the market JSONL into parquet tables and pick the tokens to fetch.

    Steps:
        1. Write all markets to ``--markets-path``.
        2. Keep markets with a non-null question, market_id, end_date and
           closed_time whose end date is strictly after ``--min-end-date``,
           rank them by volume (highest first) and take ``--top-markets``.
        3. Write all tokens to ``--tokens-path``.
        4. Write the sorted, de-duplicated token IDs of the selected markets
           to ``--filtered-tokens-path``.
    """
    args = parse_args()
    ensure_dirs()

    # 1) Markets table
    markets_df = read_market_to_df(args.market_jsonl)
    print("markets_df shape:", markets_df.shape)
    markets_df.to_parquet(args.markets_path, index=False)
    print(f"Saved: {args.markets_path}")

    # 2) Select relevant markets
    working = markets_df.dropna(subset=["question", "market_id", "end_date", "closed_time"]).copy()
    # Unparseable end dates become NaT and fail the comparison below.
    working["end_date_ts"] = pd.to_datetime(working["end_date"], errors="coerce", utc=True)
    min_end_ts = pd.Timestamp(args.min_end_date, tz="UTC")

    markets_filtered_df = (
        working[working["end_date_ts"] > min_end_ts]
        # Rank numerically even if volume was stored as text; a plain sort on
        # strings would be lexicographic. Unparseable volumes sort last.
        .sort_values(
            "volume",
            ascending=False,
            key=lambda col: pd.to_numeric(col, errors="coerce"),
        )
        .head(args.top_markets)
    )
    relevant_market_ids = set(markets_filtered_df["market_id"].astype(str))
    print(f"Selected markets: {len(relevant_market_ids)}")

    # 3) Tokens table
    tokens_df = read_token_to_df(args.market_jsonl)
    print("tokens_df shape:", tokens_df.shape)
    tokens_df.to_parquet(args.tokens_path, index=False)
    print(f"Saved: {args.tokens_path}")

    # 4) Filter token IDs by selected markets
    tokens_filtered_df = tokens_df[tokens_df["market_id"].astype(str).isin(relevant_market_ids)].copy()
    relevant_token_ids = sorted(set(tokens_filtered_df["token_id"].astype(str)))

    pd.DataFrame({"token_id": relevant_token_ids}).to_parquet(args.filtered_tokens_path, index=False)
    print(f"Saved token ID list ({len(relevant_token_ids)}): {args.filtered_tokens_path}")


if __name__ == "__main__":
main()
Loading
Loading