"""
ZenHodl — Sports Prediction Market Backtest Pack
Quickstart notebook
====================================================

This file works as a standalone Python script (run `python3 quickstart.py`) OR
as a Jupyter notebook (split on `# %%` markers, or open in VSCode / PyCharm).

What you'll learn in ~5 minutes:
  1. How to load the Parquet files (DuckDB and pandas, take your pick)
  2. How to filter to one game (we use GSW @ MIN, Jan 24 2026, Warriors 111-85)
  3. How to read the score-synced game state on the Kalshi side
  4. How to join Polymarket ↔ Kalshi by (team, time) for cross-venue analysis
  5. A worked example: spread behavior on Kalshi as score swings

This entire notebook runs on the FREE 30-row sample CSV. To run it against the
real Parquet files, point ARCHIVE_DIR at wherever you unzipped your purchase.

Requirements:
    pip install duckdb pandas pyarrow

Optional (for plotting):
    pip install matplotlib
"""

# %% [markdown]
# # 1. Setup

# %%
from pathlib import Path

# Point this at your unzipped archive directory. When you unzip
# poly_orderbook_archive.zip the contents land in a folder called
# `poly_orderbook_archive/` containing both parquet files plus README +
# DATA_CARD + LICENSE + SCHEMA. Adjust the path if you unzipped elsewhere.
ARCHIVE_DIR = Path("./poly_orderbook_archive")  # contains poly_snapshots.parquet + kalshi_snapshots.parquet

POLY_FILE = ARCHIVE_DIR / "poly_snapshots.parquet"
KALSHI_FILE = ARCHIVE_DIR / "kalshi_snapshots.parquet"

print(f"Poly file:   {POLY_FILE} (exists: {POLY_FILE.exists()})")
print(f"Kalshi file: {KALSHI_FILE} (exists: {KALSHI_FILE.exists()})")

# %% [markdown]
# # 2. Load via DuckDB (fastest — recommended for big queries)
#
# DuckDB reads Parquet directly without loading the whole file into RAM.
# This means you can query a 1.5 GB file on a laptop with 8 GB RAM. Pandas
# users can load the full file into memory with `pd.read_parquet()` — see §3.

# %%
import duckdb

# DuckDB connection. In-memory database; no setup files written.
con = duckdb.connect(":memory:")

# Quick row counts to confirm everything loads
poly_total = con.execute(
    f"SELECT COUNT(*) FROM read_parquet('{POLY_FILE}')"
).fetchone()[0]
kalshi_total = con.execute(
    f"SELECT COUNT(*) FROM read_parquet('{KALSHI_FILE}')"
).fetchone()[0]

print(f"Polymarket rows: {poly_total:>12,}")
print(f"Kalshi rows:     {kalshi_total:>12,}")
print(f"Combined:        {poly_total + kalshi_total:>12,}")

# %% [markdown]
# # 3. Pandas alternative
#
# If you prefer pandas (or you're working with the burst/event/score variants
# from the Kalshi Raw Tick Archive), use `pd.read_parquet`. Filter as much
# as possible BEFORE materializing into memory.

# %%
import pandas as pd

# Filter at read-time using PyArrow filters — much faster than loading
# everything and then filtering.
sample = pd.read_parquet(
    KALSHI_FILE,
    filters=[("ticker", "==", "KXNBAGAME-26JAN24GSWMIN-GSW")],
)
print(f"GSW (away side) moneyline rows: {len(sample):,}")
print(sample[["datetime_utc", "yes_bid", "yes_ask", "mid", "home_score", "away_score", "game_state"]].head(5))

# %% [markdown]
# # 4. Worked example: GSW @ MIN, Jan 24 2026
#
# Warriors beat Timberwolves 111-85. Let's see what the orderbook did during
# the game. Score-synced game state is on the Kalshi side, so we'll use Kalshi
# as the "spine" and join Polymarket later by timestamp.

# %%
# Filter the GSW moneyline (the "Warriors win" YES token)
gsw_book = con.execute(f"""
    SELECT
        datetime_utc,
        yes_bid,
        yes_ask,
        mid,
        spread_cents,
        home_score,
        away_score,
        period,
        time_remaining,
        game_state,
        yes_bid_size,
        yes_ask_size
    FROM read_parquet('{KALSHI_FILE}')
    WHERE ticker = 'KXNBAGAME-26JAN24GSWMIN-GSW'
    ORDER BY datetime_utc
""").df()

print(f"GSW moneyline snapshots: {len(gsw_book):,}")
print()
print("Pregame open (first row):")
print(gsw_book.iloc[0][["datetime_utc", "mid", "spread_cents", "game_state"]].to_string())
print()
print("First-quarter mid-game (period=1, 5+ minutes in):")
q1 = gsw_book[(gsw_book["period"] == 1) & (gsw_book["game_state"] == "live")]
if len(q1) > 0:
    print(q1.iloc[len(q1) // 2][["datetime_utc", "mid", "spread_cents", "home_score", "away_score"]].to_string())
print()
print("Final state (game_state='final', last row):")
final = gsw_book[gsw_book["game_state"] == "final"]
if len(final) > 0:
    print(final.iloc[-1][["datetime_utc", "mid", "yes_bid", "yes_ask", "home_score", "away_score"]].to_string())

# %% [markdown]
# # 5. Score-reaction analysis (the unique-moat demo)
#
# Here's the thing this dataset uniquely enables: align orderbook prices to
# the EXACT moment a score change happens, and measure the reprice speed +
# magnitude. No other commercial dataset has this row-by-row join.

# %%
# Find score-change moments: any row where score is different from the prior row
gsw_book["score_changed"] = (
    (gsw_book["home_score"] != gsw_book["home_score"].shift(1))
    | (gsw_book["away_score"] != gsw_book["away_score"].shift(1))
) & (gsw_book["game_state"] == "live")

score_change_moments = gsw_book[gsw_book["score_changed"]].copy()
print(f"Score-change moments captured: {len(score_change_moments)}")
print()

# For each score change, show: time, score, mid price, spread
print("Sample score events (first 10):")
print(score_change_moments[
    ["datetime_utc", "home_score", "away_score", "period", "time_remaining", "mid", "spread_cents"]
].head(10).to_string(index=False))

# %% [markdown]
# # 6. Cross-venue join: Polymarket ↔ Kalshi at the same moment
#
# Polymarket doesn't expose `game_id` directly — it uses event slugs. The
# join key is (event_title contains team names, datetime_utc bucket).
# This is an as-of join: for each Kalshi tick, find the nearest Polymarket
# tick.

# %%
# Pull the matching Polymarket data: Warriors vs. Timberwolves on Jan 24
poly_book = con.execute(f"""
    SELECT
        datetime_utc,
        question,
        yes_bid AS poly_bid,
        yes_ask AS poly_ask,
        mid AS poly_mid,
        spread AS poly_spread,
        total_bid_depth,
        total_ask_depth
    FROM read_parquet('{POLY_FILE}')
    WHERE event_title = 'Warriors vs. Timberwolves'
      AND datetime_utc >= TIMESTAMP '2026-01-24 00:00:00'
      AND datetime_utc <  TIMESTAMP '2026-01-25 00:00:00'
      AND question LIKE '%Warriors%'
    ORDER BY datetime_utc
""").df()
print(f"Polymarket rows for the Warriors side: {len(poly_book):,}")
print(poly_book.head(3))

# %%
# As-of join: each Kalshi tick gets the most-recent Polymarket tick at or before it
# pandas has merge_asof for this; here it is:
import pandas as pd

merged = pd.merge_asof(
    gsw_book.sort_values("datetime_utc"),
    poly_book.sort_values("datetime_utc"),
    on="datetime_utc",
    tolerance=pd.Timedelta(seconds=30),  # cross-venue divergence allowed up to 30s
    direction="backward",
)
print(f"Merged rows: {len(merged):,}")
print(f"Rows with both venues present: {merged['poly_mid'].notna().sum():,}")
print()
print("Cross-venue snapshot at the final moment:")
last_both = merged[merged["poly_mid"].notna()].iloc[-1]
print(f"  Kalshi mid: {last_both['mid']:.4f}  |  Poly mid: {last_both['poly_mid']:.4f}")
print(f"  Kalshi spread: {last_both['spread_cents']:.2f}c  |  Poly spread: {last_both['poly_spread']*100:.2f}c")
print(f"  Score at this moment: {last_both['away_score']} away, {last_both['home_score']} home")

# %% [markdown]
# # 7. Where to go from here
#
# Common next steps:
#   - **Arbitrage scanner:** scan the merged dataframe for `abs(mid - poly_mid) > N`
#     plus liquidity gates to find cross-venue mispricings.
#   - **Score-reaction modeling:** measure `Δmid` and `Δspread` in the 10 seconds
#     following each score-change moment. Fit per-sport regressions.
#   - **Fill-probability:** use `bid_depth_5c` / `ask_depth_5c` against future
#     mid moves to model fill probability for resting orders.
#   - **Microstructure ML features:** join open_interest + liquidity_score +
#     score_diff + period + time_remaining as features for a directional model.
#
# Questions: admin@zenhodl.net
"""
"""