add backtest harness (Phase 1): historical replay + hit-rate & calibration reports

Replays the price-derived engine over stored OHLCV: at each weekly as-of date D, rebuild the setup from bars dated <= D (no lookahead), then walk the actual forward bars to get the realized outcome. Reports the realized hit-rate/expectancy of qualified setups (and of all setups, by direction) plus a probability calibration curve (predicted target probability vs. realized hit rate). Reuses pure functions throughout; compute_technical_from_arrays / compute_momentum_from_closes were extracted from scoring_service so live and backtest stay in sync. Runs as a weekly/triggerable 'backtest' job that caches the report in a SystemSetting; GET /backtest/report serves it. Sentiment/fundamentals are held neutral (no point-in-time history), so this calibrates only the price/S-R/probability machinery. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-15 20:14:07 +02:00
parent 6d951bd760
commit 6df67ad7ae
7 changed files with 548 additions and 12 deletions
@@ -0,0 +1,118 @@
+"""Tests for the historical backtest harness."""
+
+from __future__ import annotations
+
+import math
+from datetime import date, timedelta
+
+import pytest
+
+from app.models.ohlcv import OHLCVRecord
+from app.models.ticker import Ticker
+from app.services import backtest_service as bt
+from app.services.outcome_service import (
+    OUTCOME_EXPIRED,
+    OUTCOME_STOP_HIT,
+    OUTCOME_TARGET_HIT,
+)
+from tests.conftest import _test_session_factory  # type: ignore
+
+
+@pytest.fixture
+async def session():  # yields one session opened from the shared test session factory
+    async with _test_session_factory() as db_session:
+        yield db_session
+
+
+def _cand(prob: float, outcome: str, rr: float, qualified: bool = True, direction: str = "long") -> dict:
+    """Build a candidate dict: a target hit realizes +rr, an expiry 0R, any other outcome -1R."""
+    hit = outcome == OUTCOME_TARGET_HIT
+    # R-multiple realized by this outcome
+    if hit:
+        realized_r = rr
+    elif outcome == OUTCOME_EXPIRED:
+        realized_r = 0.0
+    else:
+        realized_r = -1.0
+    return dict(primary_prob=prob, outcome=outcome, target_hit=hit, rr=rr,
+                realized_r=realized_r, qualified=qualified, direction=direction)
+
+
+def test_bucket_stats_counts_and_expectancy():
+    """Two wins, one loss and one expiry give the expected counts, hit rate and R totals."""
+    stats = bt._bucket_stats([
+        _cand(70, OUTCOME_TARGET_HIT, 3.0),   # +3R win
+        _cand(60, OUTCOME_TARGET_HIT, 2.0),   # +2R win
+        _cand(40, OUTCOME_STOP_HIT, 3.0),     # -1R loss
+        _cand(30, OUTCOME_EXPIRED, 3.0),      # 0R expired
+    ])
+    # one count per outcome class, plus the grand total
+    for key, want in (("total", 4), ("wins", 2), ("losses", 1), ("expired", 1)):
+        assert stats[key] == want
+    # the expired setup is left out of the hit-rate denominator (wins + losses only)
+    assert stats["hit_rate"] == round(2 / 3 * 100, 1)
+    # average R divides by all four candidates: (3 + 2 - 1 + 0) / 4 = 1.0
+    assert stats["avg_r"] == 1.0
+    # total R is the sum of the realized R-multiples: 3 + 2 - 1 + 0
+    assert stats["total_r"] == 4.0
+
+
+def test_bucket_stats_empty():
+    """With no candidates the total is zero and the hit-rate/average fields are None."""
+    empty_stats = bt._bucket_stats([])
+    assert empty_stats["total"] == 0
+    assert empty_stats["hit_rate"] is None and empty_stats["avg_r"] is None
+
+
+def test_calibration_buckets():
+    """Calibration rows report a sample size and realized hit rate per probability bucket."""
+    rows = bt._calibration([
+        _cand(65, OUTCOME_TARGET_HIT, 2.0),
+        _cand(62, OUTCOME_STOP_HIT, 2.0),
+        _cand(15, OUTCOME_STOP_HIT, 2.0),
+    ])
+    by_bucket = {row["bucket"]: row for row in rows}
+    # 65 and 62 are expected to share the 60-80% bucket; 15 lands in 0-20%
+    high, low = by_bucket["60-80%"], by_bucket["0-20%"]
+    assert (high["n"], high["realized_hit_rate"]) == (2, 50.0)  # 1 of 2 hit
+    assert (low["n"], low["realized_hit_rate"]) == (1, 0.0)
+
+
+def test_window_setups_too_short_returns_empty():  # degenerate case: all-empty inputs
+    assert bt._window_setups([], {}, {}) == []  # must be an empty list, not None or an error
+
+
+async def _seed_oscillating_ticker(session, symbol: str, n: int = 160) -> None:
+    """Insert one ticker plus n consecutive daily bars whose close follows a sine wave around 100."""
+    ticker = Ticker(symbol=symbol)
+    session.add(ticker)
+    await session.flush()  # flushed before ticker.id is read for the bars below
+    first_day = date(2025, 1, 1)
+    for offset in range(n):
+        price = 100.0 + 8.0 * math.sin(offset / 6.0)
+        # open equals close; high/low sit a fixed 1.5 above/below it
+        bar = OHLCVRecord(
+            ticker_id=ticker.id,
+            date=first_day + timedelta(days=offset),
+            open=price, high=price + 1.5, low=price - 1.5, close=price,
+            volume=1_000_000 + (offset % 5) * 1000,
+        )
+        session.add(bar)
+    await session.commit()
+
+
+async def test_run_backtest_smoke(session):
+    """End-to-end smoke test: one seeded ticker must produce a well-formed, non-empty report."""
+    await _seed_oscillating_ticker(session, "OSC")
+    report = await bt.run_backtest(session)
+
+    # report shape: ticker count, integer candidate count, every section present
+    assert report["tickers"] == 1
+    assert isinstance(report["candidates"], int)
+    required = ("overall_qualified", "overall_all", "by_direction", "calibration")
+    assert [name for name in required if name not in report] == []
+    # the oscillating series should yield at least some resolved setups
+    assert report["candidates"] >= 1
+    # each calibration row holds a percentage and a positive sample size
+    for row in report["calibration"]:
+        assert 0 <= row["realized_hit_rate"] <= 100 and row["n"] >= 1
@@ -83,6 +83,7 @@ class TestConfigureScheduler:
            "outcome_evaluator",
            "alerts",
            "market_regime",
+            "backtest",
        }

    def test_configure_is_idempotent(self):
@@ -93,6 +94,7 @@ class TestConfigureScheduler:
        # Each ID should appear exactly once
        assert sorted(job_ids) == sorted([
            "alerts",
+            "backtest",
            "data_collector",
            "fundamental_collector",
            "market_regime",