feat: breadth-divergence early-warning indicator + event study

Adds a leading-by-construction candidate and the harness to measure whether it actually leads regime breaks, before any of it earns weight in the live index. - breadth_service: % of the stored universe above its own 200-DMA + a divergence score (benchmark price up while breadth falls, nudged by low breadth). Genuinely leading because it keys on divergence, not level. Not wired into the live score. - event_study_service: detect drawdown events on the benchmark, then measure each indicator's median lead time (event-centered) and precision/recall vs. the base rate (signal-centered). Compares breadth-divergence against the deterministic coincident price composite (reuses the regime price sub-scores). Price/breadth only — reproducible, no LLM/FRED. - Manual "Event Study" job (Admin → Jobs), GET /regime/event-study, and an inline early-warning panel on the Regime tab with an honest small-sample caveat. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-26 14:08:52 +02:00
parent ebff19940b
commit 824c15cf69
10 changed files with 719 additions and 2 deletions
@@ -520,6 +520,7 @@ VALID_JOB_NAMES = {
    "alerts",
    "market_regime",
    "regime_monitor",
+    "event_study",
    "backtest",
    "daily_pipeline",
    "intraday_pipeline",
@@ -536,6 +537,7 @@ JOB_LABELS = {
    "alerts": "Alerts Dispatcher",
    "market_regime": "Market Regime",
    "regime_monitor": "Regime Monitor",
+    "event_study": "Event Study",
    "backtest": "Backtest",
    "daily_pipeline": "Daily Pipeline",
    "intraday_pipeline": "Intraday Pipeline",
@@ -0,0 +1,118 @@
+"""Market-breadth early-warning indicator (from the stored universe OHLCV).
+
+Breadth is a genuinely *leading* construct: a few mega-caps can keep an index
+rising while participation narrows underneath — the classic pre-top divergence.
+We measure it from the OHLCV we already store for the whole universe, so it costs
+no new data source.
+
+Two layers:
+  - breadth = % of the universe trading above its own 200-DMA (0-100).
+  - divergence = an early-warning score (0-100, high = fragile): the benchmark
+    price rising *while* breadth falls, plus a nudge for already-low breadth.
+
+This module only *computes* the indicator. It is deliberately NOT wired into the
+live regime index yet — the event study measures whether it actually leads before
+it earns any weight.
+"""
+
+from __future__ import annotations
+
+import logging
+from datetime import date
+
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.models.ticker import Ticker
+from app.services.price_service import query_ohlcv
+
+logger = logging.getLogger(__name__)
+
+Series = list[tuple[date, float]]
+
+
+def _breadth_from_closes(
+    closes_by_symbol: dict[str, Series], window: int = 200, min_tickers: int = 20
+) -> dict[date, float]:
+    """Pure core: % of symbols above their own rolling SMA(window), per date.
+
+    Each symbol's SMA is computed once with a sliding sum (O(bars)); dates with
+    fewer than ``min_tickers`` qualifying names are dropped (too thin to trust).
+    """
+    counts: dict[date, list[int]] = {}  # date -> [above, total]
+    for series in closes_by_symbol.values():
+        ordered = sorted(series, key=lambda x: x[0])
+        dates = [d for d, _ in ordered]
+        closes = [c for _, c in ordered]
+        if len(closes) < window:
+            continue
+        running = sum(closes[:window])
+        for i in range(window - 1, len(closes)):
+            if i >= window:
+                running += closes[i] - closes[i - window]
+            sma = running / window
+            entry = counts.setdefault(dates[i], [0, 0])
+            entry[1] += 1
+            if closes[i] > sma:
+                entry[0] += 1
+    return {
+        d: round(above / total * 100.0, 2)
+        for d, (above, total) in counts.items()
+        if total >= min_tickers
+    }
+
+
+def compute_divergence_series(
+    breadth: dict[date, float], benchmark_closes: Series, lookback: int = 20
+) -> dict[date, float]:
+    """Early-warning score (0-100, high = fragile) per date.
+
+    Fragility rises when the benchmark price climbs over ``lookback`` days while
+    breadth deteriorates over the same window, and is nudged up when the absolute
+    breadth level is already low. It is the *divergence* (not the level) that
+    makes this leading.
+    """
+    bench = {d: c for d, c in benchmark_closes}
+    common = sorted(d for d in bench if d in breadth)
+    out: dict[date, float] = {}
+    for i in range(lookback, len(common)):
+        d, d0 = common[i], common[i - lookback]
+        price_past = bench[d0]
+        if price_past <= 0:
+            continue
+        price_ret = (bench[d] / price_past - 1.0) * 100.0   # %
+        breadth_chg = breadth[d] - breadth[d0]              # percentage points
+        raw = price_ret - breadth_chg                        # price up & breadth down -> large
+        score = 50.0 + raw * 2.0 + (50.0 - breadth[d]) * 0.4
+        out[d] = max(0.0, min(100.0, round(score, 2)))
+    return out
+
+
+async def _load_universe_closes(db: AsyncSession) -> dict[str, Series]:
+    result = await db.execute(select(Ticker).order_by(Ticker.symbol))
+    closes_by_symbol: dict[str, Series] = {}
+    for ticker in result.scalars().all():
+        try:
+            records = await query_ohlcv(db, ticker.symbol)
+        except Exception:
+            logger.exception("Breadth: OHLCV load failed for %s", ticker.symbol)
+            continue
+        if records:
+            closes_by_symbol[ticker.symbol] = [(r.date, float(r.close)) for r in records]
+    return closes_by_symbol
+
+
+async def compute_breadth_series(
+    db: AsyncSession, window: int = 200, min_tickers: int = 20
+) -> dict[date, float]:
+    """Historical breadth series across the stored universe (for the event study)."""
+    closes_by_symbol = await _load_universe_closes(db)
+    return _breadth_from_closes(closes_by_symbol, window, min_tickers)
+
+
+async def compute_breadth_today(db: AsyncSession) -> float | None:
+    """Latest breadth reading (thin wrapper, for future live use)."""
+    series = await compute_breadth_series(db)
+    if not series:
+        return None
+    return series[max(series)]
@@ -0,0 +1,294 @@
+"""Event study: does a candidate indicator actually *lead* regime breaks?
+
+This is a backtest-style measurement, but the unit of analysis is **events**
+(historical drawdowns), not trades. For each candidate indicator it answers:
+  - how many days of warning did it give before the break (event-centered)?
+  - at what false-alarm cost (signal-centered precision/recall vs. the base rate)?
+
+It compares the breadth-divergence early-warning candidate against a deterministic
+**coincident** price composite (the existing regime price sub-scores), so you can
+see whether the candidate crosses *earlier*. Everything is price/breadth only —
+no LLM/FRED — so the result is reproducible.
+
+Honest caveat: with only a handful of real drawdowns in ~5y, the sample is tiny
+and the numbers are noisy. Read the median lead time as an order of magnitude, and
+do NOT overfit thresholds to this history.
+
+Report is cached in a SystemSetting (mirrors ``backtest_service``); a manual job
+(Admin → Jobs) drives it.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from datetime import date, datetime, timedelta, timezone
+
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.services import breadth_service, settings_store
+from app.services import regime_monitor_service as rms
+from app.services.admin_service import update_setting
+
+logger = logging.getLogger(__name__)
+
+KEY_REPORT = "regime_event_study"
+
+# Defaults — admin-tunable later if needed.
+EVENT_THRESHOLD_PCT = 15.0   # drawdown from the 52w high that counts as a "break"
+RECOVER_PCT = 5.0            # must recover to within this of the high before a new event
+DRAWDOWN_LOOKBACK = 252      # 52-week trailing high
+HORIZON_DAYS = 20           # signal-centered prediction horizon
+WARN_THRESHOLD = 60.0       # indicator level treated as "warning on"
+PRE, POST = 60, 20          # event-centered window (trading days)
+
+
+def _median(values: list[float]) -> float | None:
+    if not values:
+        return None
+    s = sorted(values)
+    n = len(s)
+    mid = n // 2
+    return float(s[mid]) if n % 2 else (s[mid - 1] + s[mid]) / 2.0
+
+
+# ---------------------------------------------------------------------------
+# Event detection
+# ---------------------------------------------------------------------------
+
+def detect_events(
+    closes: list[float],
+    dates: list[date],
+    threshold_pct: float = EVENT_THRESHOLD_PCT,
+    lookback: int = DRAWDOWN_LOOKBACK,
+    recover_pct: float = RECOVER_PCT,
+) -> list[dict]:
+    """Drawdown events: ``t0`` = first day the drawdown from the trailing 52w high
+    crosses ``threshold_pct``. De-duplicated — a new event needs a recovery back to
+    within ``recover_pct`` of the high first (so one decline = one event)."""
+    events: list[dict] = []
+    in_event = False
+    for i in range(len(closes)):
+        window = closes[max(0, i - lookback + 1): i + 1]
+        hi = max(window)
+        dd = (hi - closes[i]) / hi * 100.0 if hi > 0 else 0.0
+        if not in_event and dd >= threshold_pct:
+            events.append({"date": dates[i].isoformat(), "index": i, "depth_pct": round(dd, 1)})
+            in_event = True
+        elif in_event and dd <= recover_pct:
+            in_event = False
+    return events
+
+
+# ---------------------------------------------------------------------------
+# Event-centered: lead time + mean path
+# ---------------------------------------------------------------------------
+
+def event_centered(
+    indicator: dict[date, float],
+    events_idx: list[int],
+    dates: list[date],
+    pre: int = PRE,
+    post: int = POST,
+    threshold: float = WARN_THRESHOLD,
+) -> dict:
+    """Align the indicator at each event's ``t0`` and measure how early it warned.
+
+    Lead = the earliest day within ``[t0-pre, t0]`` at which the indicator first
+    crosses ``threshold``. Also returns the cross-event mean path.
+    """
+    leads: list[float] = []
+    sums: dict[int, float] = {}
+    counts: dict[int, int] = {}
+    for t0 in events_idx:
+        lead: int | None = None
+        for k in range(0, pre + 1):
+            idx = t0 - k
+            if idx < 0:
+                break
+            v = indicator.get(dates[idx])
+            if v is not None and v >= threshold:
+                lead = k  # keep going: the largest k = earliest warning in the window
+        if lead is not None:
+            leads.append(lead)
+        for rel in range(-pre, post + 1):
+            idx = t0 + rel
+            if 0 <= idx < len(dates):
+                v = indicator.get(dates[idx])
+                if v is not None:
+                    sums[rel] = sums.get(rel, 0.0) + v
+                    counts[rel] = counts.get(rel, 0) + 1
+    mean_path = [
+        {"rel_day": rel, "value": round(sums[rel] / counts[rel], 1)} for rel in sorted(sums)
+    ]
+    return {
+        "median_lead_days": _median(leads),
+        "events_with_signal": len(leads),
+        "events_total": len(events_idx),
+        "mean_path": mean_path,
+    }
+
+
+# ---------------------------------------------------------------------------
+# Signal-centered: precision / recall vs. base rate
+# ---------------------------------------------------------------------------
+
+def signal_centered(
+    indicator: dict[date, float],
+    events_idx: list[int],
+    dates: list[date],
+    horizon: int = HORIZON_DAYS,
+    thresholds: list[float] | None = None,
+) -> dict:
+    """Treat ``indicator >= threshold`` as predicting a break within ``horizon``
+    days. Sweep thresholds → precision/recall/alarm count, plus the base rate."""
+    thresholds = thresholds or [50, 55, 60, 65, 70, 75, 80]
+    n = len(dates)
+    labels = [1 if any(i < e <= i + horizon for e in events_idx) else 0 for i in range(n)]
+    positives = sum(labels)
+    base_rate = positives / n if n else 0.0
+
+    rows: list[dict] = []
+    for th in thresholds:
+        tp = fp = fn = 0
+        for i in range(n):
+            v = indicator.get(dates[i])
+            if v is None:
+                continue
+            pred = v >= th
+            if pred and labels[i]:
+                tp += 1
+            elif pred and not labels[i]:
+                fp += 1
+            elif not pred and labels[i]:
+                fn += 1
+        precision = tp / (tp + fp) if (tp + fp) else None
+        recall = tp / (tp + fn) if (tp + fn) else None
+        rows.append({
+            "threshold": th,
+            "precision": round(precision, 3) if precision is not None else None,
+            "recall": round(recall, 3) if recall is not None else None,
+            "alarms": tp + fp,
+        })
+    return {"base_rate": round(base_rate, 3), "horizon_days": horizon, "rows": rows}
+
+
+# ---------------------------------------------------------------------------
+# Coincident baseline (deterministic price composite, reusing the regime sub-scores)
+# ---------------------------------------------------------------------------
+
+def _coincident_series(prices: dict[str, list], dates: list[date], config: dict) -> dict[date, float]:
+    """Mean of the available price sub-scores (P1-P4) as-of each date — the
+    coincident baseline the leading candidate must beat on lead time."""
+    lw = float(config.get("leader_weight", 2.0))
+    lb = int(config.get("rs_lookback", 60))
+    t = config["tickers"]
+    smh_full = prices.get(t["leaders"][0], []) if t["leaders"] else []
+    qqq_full = prices.get(t["confirm"][0], []) if t["confirm"] else []
+    spy_full = prices.get(t["market"], [])
+    out: dict[date, float] = {}
+    for d in dates:
+        smh = rms._closes_asof(smh_full, d)
+        qqq = rms._closes_asof(qqq_full, d)
+        spy = rms._closes_asof(spy_full, d)
+        subs = [
+            rms.p1_trend_break(smh, qqq, lw),
+            rms.p2_death_cross(smh, qqq, lw),
+            rms.p3_drawdown(smh, qqq),
+            rms.p4_relative_strength(smh, spy, lb),
+        ]
+        vals = [v for v in subs if v is not None]
+        if vals:
+            out[d] = round(sum(vals) / len(vals), 2)
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Orchestration
+# ---------------------------------------------------------------------------
+
+async def run_event_study(
+    db: AsyncSession,
+    threshold_pct: float = EVENT_THRESHOLD_PCT,
+    horizon: int = HORIZON_DAYS,
+    warn_threshold: float = WARN_THRESHOLD,
+) -> dict:
+    """Run the study: detect events on the benchmark, then measure breadth-divergence
+    vs. the coincident price composite. Best-effort; returns available=False on no data."""
+    config = await rms.get_regime_config(db)
+    end = date.today()
+    start = end - timedelta(days=5 * 365 + 30)
+
+    prices = await rms._fetch_prices(config, start, end)
+    leader = config["tickers"]["leaders"][0] if config["tickers"]["leaders"] else "SMH"
+    bench = sorted(prices.get(leader, []), key=lambda x: x[0])
+    if len(bench) < 260:
+        return {"available": False, "reason": "insufficient benchmark history"}
+
+    dates = [d for d, _ in bench]
+    closes = [c for _, c in bench]
+    events = detect_events(closes, dates, threshold_pct)
+    events_idx = [e["index"] for e in events]
+
+    breadth = await breadth_service.compute_breadth_series(db)
+    divergence = breadth_service.compute_divergence_series(breadth, bench)
+    coincident = _coincident_series(prices, dates, config)
+
+    def _evaluate(series: dict[date, float]) -> dict:
+        return {
+            **event_centered(series, events_idx, dates, threshold=warn_threshold),
+            "signal": signal_centered(series, events_idx, dates, horizon),
+        }
+
+    indicators = {
+        "breadth_divergence": _evaluate(divergence),
+        "coincident_price": _evaluate(coincident),
+    }
+
+    bd = indicators["breadth_divergence"]["median_lead_days"]
+    cd = indicators["coincident_price"]["median_lead_days"]
+    lead_delta = (bd - cd) if (bd is not None and cd is not None) else None
+
+    recent_breadth = [
+        {"date": d.isoformat(), "breadth": breadth[d], "divergence": divergence.get(d)}
+        for d in dates[-90:]
+        if d in breadth
+    ]
+
+    report = {
+        "available": True,
+        "generated_at": datetime.now(timezone.utc).isoformat(),
+        "params": {
+            "benchmark": leader,
+            "event_threshold_pct": threshold_pct,
+            "horizon_days": horizon,
+            "warn_threshold": warn_threshold,
+        },
+        "events": events,
+        "indicators": indicators,
+        "lead_delta_days": lead_delta,
+        "recent_breadth": recent_breadth,
+    }
+    logger.info(json.dumps({
+        "event": "event_study_complete", "events": len(events),
+        "breadth_lead": bd, "coincident_lead": cd,
+    }))
+    return report
+
+
+async def run_and_store(db: AsyncSession) -> dict:
+    """Run the event study and cache the report in a SystemSetting. Job entrypoint."""
+    report = await run_event_study(db)
+    await update_setting(db, KEY_REPORT, json.dumps(report))
+    return report
+
+
+async def get_event_study_report(db: AsyncSession) -> dict | None:
+    """Return the last cached event-study report, or None if never run."""
+    setting = await settings_store.get_setting(db, KEY_REPORT)
+    if setting is None:
+        return None
+    try:
+        return json.loads(setting.value)
+    except (TypeError, ValueError):
+        return None