add backtest harness (Phase 1): historical replay + hit-rate & calibration reports

Replays the price-derived engine over stored OHLCV: at each weekly as-of date, rebuild the setup from bars <= D (no lookahead) and walk the actual forward bars for the realized outcome. Reports realized hit-rate/expectancy of qualified setups (and all setups, by direction) plus a probability calibration curve (predicted target prob vs realized hit rate). Reuses pure functions throughout; extracted compute_technical_from_arrays / compute_momentum_from_closes from scoring_service so live and backtest stay in sync. Runs as a weekly/triggerable 'backtest' job caching the report in a SystemSetting; GET /backtest/report serves it. Sentiment/fundamentals held neutral (no point-in-time history) — calibrates the price/S-R/probability machinery. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-15 20:14:07 +02:00
parent 6d951bd760
commit 6df67ad7ae
7 changed files with 548 additions and 12 deletions
@@ -6,6 +6,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from app.dependencies import get_db, require_access
 from app.models.user import User
 from app.schemas.common import APIEnvelope
 from app.services.backtest_service import get_backtest_report
 from app.services.market_regime_service import get_market_regime
 router = APIRouter(tags=["market"])
@@ -19,3 +20,13 @@ async def market_regime(
    """Current benchmark (SPY) trend regime: bullish / bearish / neutral."""
    data = await get_market_regime(db)
    return APIEnvelope(status="success", data=data)
@router.get("/backtest/report", response_model=APIEnvelope)
 async def backtest_report(
    _user: User = Depends(require_access),
    db: AsyncSession = Depends(get_db),
 ) -> APIEnvelope:
    """Latest cached historical backtest report (None until the job runs)."""
    data = await get_backtest_report(db)
    return APIEnvelope(status="success", data=data)
@@ -35,6 +35,7 @@ from app.providers.fundamentals_chain import build_fundamental_provider_chain
 from app.providers.protocol import SentimentData
 from app.services import fundamental_service, ingestion_service, sentiment_service
 from app.services.alert_service import dispatch_alerts
 from app.services.backtest_service import run_and_store as run_backtest_and_store
 from app.services.market_regime_service import update_market_regime
 from app.services.outcome_service import evaluate_pending_setups
 from app.services.rr_scanner_service import scan_all_tickers
@@ -145,6 +146,17 @@ _job_runtime: dict[str, dict[str, object]] = {
        "finished_at": None,
        "message": None,
    },
    "backtest": {
        "running": False,
        "status": "idle",
        "processed": 0,
        "total": None,
        "progress_pct": None,
        "current_ticker": None,
        "started_at": None,
        "finished_at": None,
        "message": None,
    },
 }
@@ -851,6 +863,45 @@ async def compute_market_regime() -> None:
        }))
 # ---------------------------------------------------------------------------
 # Job: Backtest
 # ---------------------------------------------------------------------------
 async def run_backtest_job() -> None:
    """Scheduler entrypoint: run the historical replay and cache its report.

    Records start/progress/finish in the job runtime state under the
    ``"backtest"`` name. Exceptions are not re-raised: they are recorded as an
    ``"error"`` runtime status and logged as a ``job_error`` event.
    """
    job_name = "backtest"
    logger.info(json.dumps({"event": "job_start", "job": job_name}))
    _runtime_start(job_name)

    def _on_progress(done: int, count: int, symbol: str) -> None:
        # An empty symbol (sent with the final progress tick) is shown as "no ticker".
        current = symbol or None
        _runtime_progress(job_name, processed=done, total=count, current_ticker=current)

    try:
        async with async_session_factory() as db:
            enabled = await _is_job_enabled(db, job_name)
            if not enabled:
                skip_event = {"event": "job_skipped", "job": job_name, "reason": "disabled"}
                logger.info(json.dumps(skip_event))
                _runtime_finish(job_name, "skipped", processed=0, total=0, message="Disabled")
                return
            report = await run_backtest_and_store(db, _on_progress)
        ticker_count = report.get("tickers", 0)
        candidate_count = report.get("candidates", 0)
        qualified_count = report.get("qualified", 0)
        _runtime_finish(
            job_name,
            "completed",
            processed=ticker_count,
            total=ticker_count,
            message=f"{candidate_count} setups, {qualified_count} qualified",
        )
        done_event = {"event": "job_complete", "job": job_name, "candidates": report.get("candidates")}
        logger.info(json.dumps(done_event))
    except Exception as exc:
        error_text = str(exc)
        _runtime_finish(job_name, "error", processed=0, total=None, message=error_text)
        error_event = {
            "event": "job_error",
            "job": job_name,
            "error_type": type(exc).__name__,
            "message": error_text,
        }
        logger.error(json.dumps(error_event))
 # ---------------------------------------------------------------------------
 # Job: Ticker Universe Sync
 # ---------------------------------------------------------------------------
@@ -1010,6 +1061,16 @@ def configure_scheduler() -> None:
        replace_existing=True,
    )
    # Backtest — weekly historical replay (expensive; mostly run on demand)
    scheduler.add_job(
        run_backtest_job,
        "interval",
        hours=168,
        id="backtest",
        name="Backtest",
        replace_existing=True,
    )
    logger.info(
        json.dumps({
            "event": "scheduler_configured",
@@ -484,6 +484,7 @@ VALID_JOB_NAMES = {
    "outcome_evaluator",
    "alerts",
    "market_regime",
    "backtest",
 }
 JOB_LABELS = {
@@ -495,6 +496,7 @@ JOB_LABELS = {
    "outcome_evaluator": "Outcome Evaluator",
    "alerts": "Alerts Dispatcher",
    "market_regime": "Market Regime",
    "backtest": "Backtest",
 }
@@ -0,0 +1,322 @@
 """Historical backtest (Phase 1): replay the price-derived engine over stored
 OHLCV and measure how the CURRENT config would have performed.
 For each ticker we step through history (weekly), and at each as-of date D we
 rebuild the setup using only bars ≤ D (no lookahead), then walk the actual bars
 after D to record the realized outcome. Two reports come out:
  - realized hit-rate / expectancy of qualified setups (and of all setups)
  - a probability calibration curve: do "60% likely" targets hit ~60% of the time?
 Limitation: sentiment and fundamentals have no point-in-time history, so they're
 held neutral here — this calibrates the price/S-R/probability machinery only.
 """
 from __future__ import annotations
 import json
 import logging
 from collections.abc import Callable
 from datetime import datetime, timezone
 from types import SimpleNamespace
 from typing import Any
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 from app.models.settings import SystemSetting
 from app.models.ticker import Ticker
 from app.services.admin_service import get_activation_config, update_setting
 from app.services.indicator_service import _extract_ohlcv, compute_atr
 from app.services.outcome_service import (
    OUTCOME_AMBIGUOUS,
    OUTCOME_STOP_HIT,
    OUTCOME_TARGET_HIT,
    Bar,
    evaluate_setup_against_bars,
 )
 from app.services.price_service import query_ohlcv
 from app.services.qualification import best_target_probability, setup_qualifies
 from app.services.recommendation_service import (
    _choose_recommended_action,
    _classify_by_probability,
    _risk_level_from_conflicts,
    _select_primary_target,
    _zone_representative_levels,
    direction_analyzer,
    get_recommendation_config,
    probability_estimator,
    signal_conflict_detector,
    target_generator,
 )
 from app.services.scoring_service import (
    compute_momentum_from_closes,
    compute_technical_from_arrays,
 )
 from app.services.sr_service import detect_sr_levels
 logger = logging.getLogger(__name__)
 # SystemSetting key under which the JSON-serialized report is cached.
 KEY_REPORT = "backtest_report"
 STEP_DAYS = 5          # weekly cadence (≈ 5 trading days)
 MIN_LOOKBACK = 60      # bars needed before D for indicators (EMA cross needs 51)
 HORIZON = 30           # trading days to resolve an outcome (matches the evaluator)
 # Stop distance from the entry price, in multiples of ATR.
 ATR_MULTIPLIER = 1.5
 # Half-open [lo, hi) probability buckets for the calibration table. The last
 # upper bound is 100.01 so a probability of exactly 100 still lands in a bucket.
 _CAL_BUCKETS = [(0, 20), (20, 40), (40, 60), (60, 80), (80, 100.01)]
 def _wrap_levels(level_dicts: list[dict]) -> list[Any]:
    return [
        SimpleNamespace(
            id=i,
            price_level=float(d["price_level"]),
            type=d["type"],
            strength=int(d["strength"]),
        )
        for i, d in enumerate(level_dicts)
    ]
 def _window_setups(
    window_records: list,
    config: dict,
    activation: dict,
 ) -> list[dict]:
    """Rebuild the setup(s) at the last bar of ``window_records`` (the as-of date),
    using only those bars.

    Args:
        window_records: OHLCV records up to and including the as-of bar; the
            close of the last one is used as the entry price.
        config: recommendation config forwarded to the conflict detector, the
            probability estimator and the action chooser.
        activation: activation config forwarded to ``setup_qualifies``.

    Returns:
        One dict per direction ("long"/"short") that produced targets and a
        primary target, with keys ``direction``, ``entry``, ``stop``,
        ``target``, ``rr``, ``confidence``, ``primary_prob``, ``best_prob``,
        ``action``, ``risk_level`` and ``qualified``. Empty when the window is
        shorter than ``MIN_LOOKBACK``, the entry price or ATR is not positive,
        the ATR computation raises, or no S/R levels are detected.
    """
    if len(window_records) < MIN_LOOKBACK:
        return []
    _, highs, lows, closes, volumes = _extract_ohlcv(window_records)
    entry = closes[-1]
    if entry <= 0:
        return []
    try:
        atr = compute_atr(highs, lows, closes)["atr"]
    except Exception:
        # Best-effort: a window the ATR helper cannot handle yields no setup.
        return []
    if atr <= 0:
        return []
    sr_levels = _wrap_levels(detect_sr_levels(highs, lows, closes, volumes))
    if not sr_levels:
        return []
    # Fall back to a neutral 50 only when a score is unavailable (None). A
    # plain ``or 50.0`` would also overwrite a genuine score of 0.0, because
    # 0.0 is falsy, turning the most extreme reading into a neutral one.
    technical_score = compute_technical_from_arrays(highs, lows, closes, volumes)[0]
    momentum_score = compute_momentum_from_closes(closes)[0]
    technical = 50.0 if technical_score is None else technical_score
    momentum = 50.0 if momentum_score is None else momentum_score
    dim_scores = {"technical": technical, "momentum": momentum}
    # None is passed where these helpers take a further signal input —
    # presumably the sentiment slot, held neutral in the backtest; confirm
    # against recommendation_service.
    conflicts = signal_conflict_detector.detect_conflicts(dim_scores, None, config)
    confidences = {
        "long": direction_analyzer.calculate_confidence("long", dim_scores, None, conflicts),
        "short": direction_analyzer.calculate_confidence("short", dim_scores, None, conflicts),
    }
    # First pass: build targets per direction
    per_dir: dict[str, dict] = {}
    for direction in ("long", "short"):
        # Stop sits ATR_MULTIPLIER ATRs on the losing side of the entry.
        stop = entry - atr * ATR_MULTIPLIER if direction == "long" else entry + atr * ATR_MULTIPLIER
        zone_levels = _zone_representative_levels(sr_levels, entry)
        targets = target_generator.generate_targets(direction, entry, stop, zone_levels, atr)
        if not targets:
            continue
        for t in targets:
            t["probability"] = probability_estimator.estimate_probability(
                t, dim_scores, None, direction, config
            )
            t["classification"] = _classify_by_probability(t["probability"])
        primary = _select_primary_target(targets)
        if primary is None:
            continue
        per_dir[direction] = {"stop": stop, "targets": targets, "primary": primary}
    available = set(per_dir.keys())
    if not available:
        return []
    # The recommended action is chosen once and shared by every direction's setup.
    action = _choose_recommended_action(confidences["long"], confidences["short"], config, available)
    # Second pass: assemble one result per direction that survived the first pass
    out: list[dict] = []
    for direction, data in per_dir.items():
        targets, primary, stop = data["targets"], data["primary"], data["stop"]
        # Copy so the per-direction extra conflict does not leak into the other direction.
        setup_conflicts = list(conflicts)
        if len(targets) < 3:
            setup_conflicts.append("target-availability: Fewer than 3 valid S/R targets available")
        risk_level = _risk_level_from_conflicts(setup_conflicts)
        rr = float(primary["rr_ratio"])
        target_price = float(primary["price"])
        # Attribute-style stand-in handed to the qualification helpers.
        setup_ns = SimpleNamespace(
            rr_ratio=rr,
            confidence_score=confidences[direction],
            recommended_action=action,
            risk_level=risk_level,
            targets=targets,
            direction=direction,
            target=target_price,
            stop_loss=stop,
            entry_price=entry,
        )
        out.append({
            "direction": direction,
            "entry": entry,
            "stop": stop,
            "target": target_price,
            "rr": rr,
            "confidence": confidences[direction],
            "primary_prob": float(primary["probability"]),
            "best_prob": best_target_probability(setup_ns),
            "action": action,
            "risk_level": risk_level,
            "qualified": setup_qualifies(setup_ns, activation),
        })
    return out
 def _replay_ticker(symbol: str, records: list, config: dict, activation: dict) -> list[dict]:
    """Walk one ticker's history weekly, building setups and their realized outcomes.

    Args:
        symbol: ticker symbol copied into every candidate row.
        records: the ticker's OHLCV records; each needs ``date``, ``high`` and
            ``low`` attributes here. Assumed to be in chronological order —
            TODO confirm against ``query_ohlcv``.
        config: recommendation config forwarded to ``_window_setups``.
        activation: activation config forwarded to ``_window_setups``.

    Returns:
        One dict per setup whose outcome was resolved (setups the evaluator
        reports as ``None`` are dropped). Empty when there are fewer than
        ``MIN_LOOKBACK + HORIZON`` records.
    """
    candidates: list[dict] = []
    n = len(records)
    if n < MIN_LOOKBACK + HORIZON:
        return candidates
    first_as_of = MIN_LOOKBACK - 1
    # Build the forward-walk bars once. Every as-of step only needs a suffix of
    # this list, so rebuilding the objects per step was quadratic work.
    future_bars = [
        Bar(date=r.date, high=r.high, low=r.low) for r in records[first_as_of + 1 :]
    ]
    for i in range(first_as_of, n - HORIZON, STEP_DAYS):
        # Only bars up to and including index i are visible to the setup builder.
        setups = _window_setups(records[: i + 1], config, activation)
        if not setups:
            continue
        # future_bars[k] corresponds to records[first_as_of + 1 + k], so this
        # slice is exactly the bars after the as-of bar.
        forward_bars = future_bars[i - first_as_of :]
        as_of = records[i].date.isoformat()
        for s in setups:
            outcome, _ = evaluate_setup_against_bars(
                s["direction"], s["stop"], s["target"], forward_bars, HORIZON
            )
            if outcome is None:
                continue
            target_hit = outcome == OUTCOME_TARGET_HIT
            if target_hit:
                realized_r = s["rr"]
            elif outcome in (OUTCOME_STOP_HIT, OUTCOME_AMBIGUOUS):
                # Ambiguous outcomes are scored like a stop-out: a full 1R loss.
                realized_r = -1.0
            else:  # expired
                realized_r = 0.0
            candidates.append({
                "symbol": symbol,
                "date": as_of,
                "direction": s["direction"],
                "rr": s["rr"],
                "confidence": s["confidence"],
                "primary_prob": s["primary_prob"],
                "qualified": s["qualified"],
                "outcome": outcome,
                "target_hit": target_hit,
                "realized_r": realized_r,
            })
    return candidates
 def _bucket_stats(cands: list[dict]) -> dict:
    """Summarize a group of candidates: counts, hit rate and realized R.

    ``hit_rate`` is a percentage over decided candidates only (wins + losses).
    ``avg_r`` and ``total_r`` cover every candidate, expired ones included.
    The rate and both R figures are ``None`` when there is nothing to divide by.
    """
    loss_outcomes = (OUTCOME_STOP_HIT, OUTCOME_AMBIGUOUS)
    resolved_outcomes = (OUTCOME_TARGET_HIT, OUTCOME_STOP_HIT, OUTCOME_AMBIGUOUS)
    wins = 0
    losses = 0
    expired = 0
    realized: list = []
    for cand in cands:
        if cand["target_hit"]:
            wins += 1
        if cand["outcome"] in loss_outcomes:
            losses += 1
        # Anything that is neither a target hit nor a loss counts as expired.
        if cand["outcome"] not in resolved_outcomes:
            expired += 1
        realized.append(cand["realized_r"])
    decided = wins + losses
    hit_rate = None
    if decided:
        hit_rate = round(wins / decided * 100, 1)
    avg_r = None
    total_r = None
    if realized:
        avg_r = round(sum(realized) / len(realized), 3)
        total_r = round(sum(realized), 2)
    return {
        "total": len(cands),
        "wins": wins,
        "losses": losses,
        "expired": expired,
        "hit_rate": hit_rate,
        "avg_r": avg_r,
        "total_r": total_r,
    }
 def _calibration(cands: list[dict]) -> list[dict]:
    """Build the calibration table: predicted probability vs realized hit rate.

    Candidates are grouped by ``primary_prob`` into the half-open ranges of
    ``_CAL_BUCKETS``. Empty ranges produce no row, and the hit rate here is
    taken over every candidate in the range, not only the decided ones.
    """
    table: list[dict] = []
    for lower, upper in _CAL_BUCKETS:
        members = [c for c in cands if lower <= c["primary_prob"] < upper]
        size = len(members)
        if size == 0:
            continue
        hit_count = len([c for c in members if c["target_hit"]])
        prob_total = sum(c["primary_prob"] for c in members)
        # Cap the label's upper bound at 100 so the last bucket does not show 100.01.
        label = f"{int(lower)}-{int(min(upper, 100))}%"
        table.append({
            "bucket": label,
            "n": size,
            "predicted_avg": round(prob_total / size, 1),
            "realized_hit_rate": round(hit_count / size * 100, 1),
        })
    return table
 async def run_backtest(
    db: AsyncSession,
    progress_cb: Callable[[int, int, str], None] | None = None,
 ) -> dict:
    """Replay all tickers and assemble the Phase-1 report for the current config.

    Args:
        db: session used to load configs, tickers and OHLCV.
        progress_cb: optional ``(done, total, symbol)`` callback, invoked before
            each ticker and once more with ``(total, total, "")`` at the end
            when there is at least one ticker.

    Returns:
        The report dict. A ticker whose replay raises is logged and skipped, so
        one bad ticker does not abort the run.
    """
    config = await get_recommendation_config(db)
    activation = await get_activation_config(db)
    ticker_rows = await db.execute(select(Ticker).order_by(Ticker.symbol))
    tickers = list(ticker_rows.scalars().all())
    total = len(tickers)
    notify = progress_cb is not None

    candidates: list[dict] = []
    for index, ticker in enumerate(tickers):
        if notify:
            progress_cb(index, total, ticker.symbol)
        try:
            records = await query_ohlcv(db, ticker.symbol)
            ticker_candidates = _replay_ticker(ticker.symbol, records, config, activation)
            candidates.extend(ticker_candidates)
        except Exception:
            logger.exception("Backtest replay failed for %s", ticker.symbol)
    if notify and total:
        progress_cb(total, total, "")

    # Split the qualified setups by direction in a single pass.
    qualified: list[dict] = []
    by_side: dict[str, list[dict]] = {"long": [], "short": []}
    for cand in candidates:
        if not cand["qualified"]:
            continue
        qualified.append(cand)
        side = cand["direction"]
        if side in by_side:
            by_side[side].append(cand)

    params = {"step_days": STEP_DAYS, "horizon_days": HORIZON, "min_lookback": MIN_LOOKBACK}
    return {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "tickers": total,
        "candidates": len(candidates),
        "qualified": len(qualified),
        "params": params,
        "activation": activation,
        "overall_qualified": _bucket_stats(qualified),
        "overall_all": _bucket_stats(candidates),
        "by_direction": {
            "long": _bucket_stats(by_side["long"]),
            "short": _bucket_stats(by_side["short"]),
        },
        "calibration": _calibration(candidates),
        "note": (
            "Sentiment & fundamentals held neutral (no point-in-time history). "
            "~6 months ≈ one market regime — treat as directional, not gospel."
        ),
    }
 async def run_and_store(
    db: AsyncSession,
    progress_cb: Callable[[int, int, str], None] | None = None,
 ) -> dict:
    """Job entrypoint: run the backtest, then cache the report as JSON.

    The report is stored via ``update_setting`` under ``KEY_REPORT`` and also
    returned to the caller unchanged.
    """
    report = await run_backtest(db, progress_cb)
    serialized = json.dumps(report)
    await update_setting(db, KEY_REPORT, serialized)
    return report
 async def get_backtest_report(db: AsyncSession) -> dict | None:
    """Load the cached backtest report.

    Returns ``None`` when no setting row exists under ``KEY_REPORT`` or when
    the stored value cannot be decoded as JSON.
    """
    query = select(SystemSetting).where(SystemSetting.key == KEY_REPORT)
    setting = (await db.execute(query)).scalar_one_or_none()
    if setting is None:
        return None
    try:
        report = json.loads(setting.value)
    except (TypeError, ValueError):
        # An unreadable cache entry is treated the same as a missing one.
        report = None
    return report
@@ -88,11 +88,28 @@ async def _save_weights(db: AsyncSession, weights: dict[str, float]) -> None:
 async def _compute_technical_score(
    db: AsyncSession, symbol: str
 ) -> tuple[float | None, dict | None]:
-    """Compute technical dimension score from ADX, EMA, RSI, EMA Cross,
+    """Compute technical dimension score from stored OHLCV (DB wrapper)."""
-    Volume Profile and Pivot Points.
+    from app.services.indicator_service import _extract_ohlcv
    from app.services.price_service import query_ohlcv
-    Returns (score, breakdown) where breakdown follows the ScoreBreakdown
+    records = await query_ohlcv(db, symbol)
-    TypedDict shape: {sub_scores, formula, unavailable}.
+    if not records:
        return None, None
    _, highs, lows, closes, volumes = _extract_ohlcv(records)
    return compute_technical_from_arrays(highs, lows, closes, volumes)
 def compute_technical_from_arrays(
    highs: list[float],
    lows: list[float],
    closes: list[float],
    volumes: list[int],
 ) -> tuple[float | None, dict | None]:
    """Technical score from raw OHLCV arrays — ADX, EMA, RSI, EMA Cross, Volume
    Profile, Pivot Points. Pure (no DB) so the backtest can compute it as-of-date.
    Returns (score, breakdown).
    """
    from app.services.indicator_service import (
        compute_adx,
@@ -101,16 +118,11 @@ async def _compute_technical_score(
        compute_pivot_points,
        compute_rsi,
        compute_volume_profile,
        _extract_ohlcv,
    )
    from app.services.price_service import query_ohlcv
-    records = await query_ohlcv(db, symbol)
+    if not closes:
    if not records:
        return None, None
    _, highs, lows, closes, volumes = _extract_ohlcv(records)
    formula = (
        "Weighted average: 0.30*ADX + 0.20*EMA + 0.20*RSI + 0.15*EMA_Cross "
        "+ 0.10*Volume_Profile + 0.05*Pivot_Points, re-normalized if any "
@@ -514,13 +526,21 @@ async def _compute_momentum_score(
    """
    from app.services.price_service import query_ohlcv
    formula = "Weighted average: 0.5 * ROC_5 + 0.5 * ROC_20, re-normalized if any sub-score unavailable."
    records = await query_ohlcv(db, symbol)
    if not records or len(records) < 6:
        return None, None
    closes = [float(r.close) for r in records]
    return compute_momentum_from_closes(closes)
 def compute_momentum_from_closes(closes: list[float]) -> tuple[float | None, dict | None]:
    """Momentum score (5- and 20-day ROC) from a close series. Pure (no DB)."""
    formula = "Weighted average: 0.5 * ROC_5 + 0.5 * ROC_20, re-normalized if any sub-score unavailable."
    if not closes or len(closes) < 6:
        return None, None
    latest = closes[-1]
    scores: list[tuple[float, float]] = []  # (weight, score)
@@ -0,0 +1,118 @@
 """Tests for the historical backtest harness."""
 from __future__ import annotations
 import math
 from datetime import date, timedelta
 import pytest
 from app.models.ohlcv import OHLCVRecord
 from app.models.ticker import Ticker
 from app.services import backtest_service as bt
 from app.services.outcome_service import (
    OUTCOME_EXPIRED,
    OUTCOME_STOP_HIT,
    OUTCOME_TARGET_HIT,
 )
 from tests.conftest import _test_session_factory  # type: ignore
@pytest.fixture
 async def session():
    async with _test_session_factory() as s:
        yield s
 def _cand(prob: float, outcome: str, rr: float, qualified: bool = True, direction: str = "long") -> dict:
    """Build a minimal candidate row for the aggregation tests.

    Realized R is ``rr`` for a target hit, 0.0 for an expired setup and -1.0
    for any other outcome.
    """
    target_hit = outcome == OUTCOME_TARGET_HIT
    if target_hit:
        realized = rr
    elif outcome == OUTCOME_EXPIRED:
        realized = 0.0
    else:
        realized = -1.0
    return dict(
        primary_prob=prob,
        outcome=outcome,
        target_hit=target_hit,
        rr=rr,
        realized_r=realized,
        qualified=qualified,
        direction=direction,
    )
 def test_bucket_stats_counts_and_expectancy():
    """Two wins, one loss and one expiry give the expected counts and R figures."""
    stats = bt._bucket_stats([
        _cand(70, OUTCOME_TARGET_HIT, 3.0),   # +3R win
        _cand(60, OUTCOME_TARGET_HIT, 2.0),   # +2R win
        _cand(40, OUTCOME_STOP_HIT, 3.0),     # -1R loss
        _cand(30, OUTCOME_EXPIRED, 3.0),      # 0R expired
    ])
    expected = {
        "total": 4,
        "wins": 2,
        "losses": 1,
        "expired": 1,
        # hit rate is over decided (wins+losses) only
        "hit_rate": round(2 / 3 * 100, 1),
        # avg R = (3 + 2 - 1 + 0) / 4 = 1.0
        "avg_r": 1.0,
        "total_r": 4.0,
    }
    for key, value in expected.items():
        assert stats[key] == value
 def test_bucket_stats_empty():
    """With no candidates the total is zero and the ratios are undefined (None)."""
    stats = bt._bucket_stats([])
    assert stats["total"] == 0
    for ratio_key in ("hit_rate", "avg_r"):
        assert stats[ratio_key] is None
 def test_calibration_buckets():
    """Candidates land in the right probability bucket with the right hit rate."""
    rows = bt._calibration([
        _cand(65, OUTCOME_TARGET_HIT, 2.0),
        _cand(62, OUTCOME_STOP_HIT, 2.0),
        _cand(15, OUTCOME_STOP_HIT, 2.0),
    ])
    by_bucket = {row["bucket"]: row for row in rows}
    upper = by_bucket["60-80%"]
    assert upper["n"] == 2
    assert upper["realized_hit_rate"] == 50.0  # 1 of 2 hit
    lowest = by_bucket["0-20%"]
    assert lowest["n"] == 1
    assert lowest["realized_hit_rate"] == 0.0
 def test_window_setups_too_short_returns_empty():
    """A window shorter than the minimum lookback produces no setups."""
    setups = bt._window_setups([], {}, {})
    assert setups == []
 async def _seed_oscillating_ticker(session, symbol: str, n: int = 160) -> None:
    """Insert a ticker with ``n`` consecutive daily bars following a sine wave.

    Closes oscillate around 100 with amplitude 8; each bar's high/low sit 1.5
    above/below its close, and open equals close.
    """
    ticker = Ticker(symbol=symbol)
    session.add(ticker)
    # Flush before the bars reference ticker.id (presumably assigned on flush).
    await session.flush()
    first_day = date(2025, 1, 1)
    for offset in range(n):
        price = 100.0 + 8.0 * math.sin(offset / 6.0)
        bar = OHLCVRecord(
            ticker_id=ticker.id,
            date=first_day + timedelta(days=offset),
            open=price,
            high=price + 1.5,
            low=price - 1.5,
            close=price,
            volume=1_000_000 + (offset % 5) * 1000,
        )
        session.add(bar)
    await session.commit()
 async def test_run_backtest_smoke(session):
    """End-to-end run over one seeded ticker yields a well-formed report."""
    await _seed_oscillating_ticker(session, "OSC")
    report = await bt.run_backtest(session)
    # well-formed report
    assert report["tickers"] == 1
    assert isinstance(report["candidates"], int)
    required_sections = ("overall_qualified", "overall_all", "by_direction", "calibration")
    for section in required_sections:
        assert section in report
    # the oscillating series should yield at least some resolved setups
    assert report["candidates"] >= 1
    # every calibration row is internally consistent
    for calibration_row in report["calibration"]:
        assert 0 <= calibration_row["realized_hit_rate"] <= 100
        assert calibration_row["n"] >= 1
@@ -83,6 +83,7 @@ class TestConfigureScheduler:
            "outcome_evaluator",
            "alerts",
            "market_regime",
            "backtest",
        }
    def test_configure_is_idempotent(self):
@@ -93,6 +94,7 @@ class TestConfigureScheduler:
        # Each ID should appear exactly once
        assert sorted(job_ids) == sorted([
            "alerts",
            "backtest",
            "data_collector",
            "fundamental_collector",
            "market_regime",