Files
signal-platform/tests/unit/test_backtest_service.py
T
dennisthiessen 942a22ce65
Deploy / lint (push) Successful in 6s
Deploy / test (push) Successful in 55s
Deploy / deploy (push) Successful in 33s
feat: grade gate-ablation variants under the hold-to-horizon exit too
The ablation judged floors under the target/stop model, but the exit
sweeps point at replacing that exit with a fixed hold — under which the
R:R floor's rationale (bigger payoff at the target) may not apply. Each
ablation row now also carries hold_avg_r / hold_net_avg_r / hold_total_r
(30d hold, initial stop only), so the Phase 3 gate decision can be read
under the exit policy that would actually be used.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-07-02 11:34:41 +02:00

405 lines
15 KiB
Python

"""Tests for the historical backtest harness."""
from __future__ import annotations
import math
from datetime import date, timedelta
from types import SimpleNamespace
import pytest
from app.models.ohlcv import OHLCVRecord
from app.models.ticker import Ticker
from app.services import backtest_service as bt
from app.services.outcome_service import (
OUTCOME_EXPIRED,
OUTCOME_STOP_HIT,
OUTCOME_TARGET_HIT,
)
from tests.conftest import _test_session_factory # type: ignore
@pytest.fixture
async def session():
    """Yield a session opened from the shared test session factory.

    The surrounding ``async with`` is exited once the consuming test is done
    with the yielded session.
    """
    async with _test_session_factory() as db_session:
        yield db_session
def _cand(
    prob: float,
    outcome: str,
    rr: float,
    qualified: bool = True,
    direction: str = "long",
    risk_pct: float = 0.05,
) -> dict:
    """Build a synthetic resolved candidate for the bucket/calibration tests.

    ``realized_r`` is derived from ``outcome``: ``rr`` on a target hit, ``0.0``
    when the setup expired, and ``-1.0`` for every other outcome.
    """
    hit = outcome == OUTCOME_TARGET_HIT
    if hit:
        realized_r = rr
    elif outcome == OUTCOME_EXPIRED:
        realized_r = 0.0
    else:
        realized_r = -1.0
    return dict(
        primary_prob=prob,
        outcome=outcome,
        target_hit=hit,
        rr=rr,
        realized_r=realized_r,
        qualified=qualified,
        direction=direction,
        risk_pct=risk_pct,
    )
# Round-trip cost in R for the default _cand risk_pct (0.05): two sides of
# bt.COST_PER_SIDE divided by the risk fraction. With COST_PER_SIDE at 0.001
# (value is defined in backtest_service, not visible here -- confirm) this is
# 2 * 0.001 / 0.05 = 0.04R.
_COST_R_005 = 2 * bt.COST_PER_SIDE / 0.05
def _bar(high: float, low: float, close: float) -> SimpleNamespace:
return SimpleNamespace(high=high, low=low, close=close)
class TestTakeProfitPrimitives:
    """Exercise ``bt._tp_primitives``.

    Every test unpacks the result as ``(risk, stopped, mfe, close_pct)`` for an
    entry at 100.0 with the stop 5 points away.
    """

    def test_long_tp_reachable_before_stop(self):
        """A long whose only bar stays above the stop reports its excursion."""
        only_bar = [_bar(109, 101, 108)]
        risk, stopped, mfe, close_pct = bt._tp_primitives(
            "long", 100.0, 95.0, only_bar, 30
        )
        assert risk == pytest.approx(0.05)
        assert stopped is False
        assert mfe == pytest.approx(0.09)
        assert close_pct == pytest.approx(0.08)

    def test_long_stop_zeroes_mfe(self):
        """A stop on the only bar leaves no favourable excursion banked."""
        # Low pierces the stop on the only bar → loss, nothing banked before it.
        piercing_bar = [_bar(101, 94, 96)]
        _risk, stopped, mfe, close_pct = bt._tp_primitives(
            "long", 100.0, 95.0, piercing_bar, 30
        )
        assert stopped is True
        assert mfe == pytest.approx(0.0)
        assert close_pct == pytest.approx(-0.04)

    def test_long_drift_no_trigger(self):
        """Two quiet bars: no stop, best high and last close are reported."""
        drifting = [_bar(102, 99, 101), _bar(103, 100, 102)]
        _risk, stopped, mfe, close_pct = bt._tp_primitives(
            "long", 100.0, 95.0, drifting, 30
        )
        assert stopped is False
        assert mfe == pytest.approx(0.03)
        assert close_pct == pytest.approx(0.02)

    def test_short_direction(self):
        """Shorts measure the favourable move downwards from the entry."""
        # short entry 100, stop 105; price falls → favourable = (entry - low)/entry
        falling_bar = [_bar(101, 92, 93)]
        risk, stopped, mfe, close_pct = bt._tp_primitives(
            "short", 100.0, 105.0, falling_bar, 30
        )
        assert risk == pytest.approx(0.05)
        assert stopped is False
        assert mfe == pytest.approx(0.08)
        assert close_pct == pytest.approx(0.07)
class TestTakeProfitBucket:
    """Exercise ``bt._take_profit_bucket`` with an 8% take-profit level."""

    def test_bucket_mix(self):
        """One win, one stop and one timeout aggregate into gross and net R."""
        winner = dict(risk_pct=0.05, mfe_pct=0.09, tp_stopped=False, tp_close_pct=0.08)  # +1.6R win
        stopped = dict(risk_pct=0.05, mfe_pct=0.02, tp_stopped=True, tp_close_pct=-0.04)  # -1R stop
        timed_out = dict(risk_pct=0.05, mfe_pct=0.03, tp_stopped=False, tp_close_pct=0.01)  # +0.2R timeout
        bucket = bt._take_profit_bucket([winner, stopped, timed_out], 0.08)

        assert bucket["total"] == 3
        assert bucket["wins"] == 1
        assert bucket["hit_rate"] == pytest.approx(33.3, abs=0.1)
        assert bucket["total_r"] == pytest.approx(0.8, abs=0.01)
        assert bucket["avg_r"] == pytest.approx(0.267, abs=0.01)

        # net: minus a 0.04R round trip per candidate (risk_pct 0.05)
        expected_net_total = 0.8 - 3 * _COST_R_005
        assert bucket["net_total_r"] == pytest.approx(expected_net_total, abs=0.01)
        assert bucket["net_avg_r"] == pytest.approx(expected_net_total / 3, abs=0.01)

    def test_zero_risk_skipped(self):
        """A candidate with zero risk is left out of the bucket entirely."""
        riskless = dict(risk_pct=0.0, mfe_pct=0.2, tp_stopped=False, tp_close_pct=0.1)
        bucket = bt._take_profit_bucket([riskless], 0.08)
        assert bucket["total"] == 0
        assert bucket["avg_r"] is None
class TestTrailingExits:
    """Exercise ``bt._trailing_exits`` for a long entered at 100 with a stop at 90.

    Results are read back by trail width in whole percent (``res[10]`` for a
    ``0.10`` trail).
    """

    def test_locks_gain_on_pullback(self):
        """A pullback through the trailing level banks the locked-in gain."""
        # Runs to 120, then a 10% trail (from peak 120 → 108) is pierced on the drop.
        run_then_drop = [_bar(120, 110, 118), _bar(130, 100, 105)]
        exits = bt._trailing_exits("long", 100.0, 90.0, (0.10,), run_then_drop, 30)
        assert exits[10] == pytest.approx(0.8)  # (108-100)/100 / 0.10 risk

    def test_initial_stop_caps_loss(self):
        """A trail wider than the initial stop cannot lose more than 1R."""
        # Trail (20%) is looser than the initial stop → initial stop governs = -1R.
        stop_out = [_bar(101, 89, 90)]
        exits = bt._trailing_exits("long", 100.0, 90.0, (0.20,), stop_out, 30)
        assert exits[20] == pytest.approx(-1.0)

    def test_timeout_exits_at_close(self):
        """With neither stop nor trail hit, the exit is the last close."""
        quiet = [_bar(105, 98, 104), _bar(106, 100, 105)]
        exits = bt._trailing_exits("long", 100.0, 90.0, (0.20,), quiet, 30)
        assert exits[20] == pytest.approx(0.5)  # close 105 → +5% / 10% risk

    def test_multiple_widths_one_pass(self):
        """Several trail widths are evaluated over the same bars in one call."""
        # Tighter trail locks in more here (exit at 114 vs 108).
        run_then_drop = [_bar(120, 110, 118), _bar(130, 100, 105)]
        exits = bt._trailing_exits("long", 100.0, 90.0, (0.10, 0.05), run_then_drop, 30)
        assert exits[10] == pytest.approx(0.8)
        assert exits[5] == pytest.approx(1.4)
class TestTrailingBucket:
    """Exercise ``bt._trailing_bucket`` aggregation for one trail width."""

    def test_bucket(self):
        """Three candidates at the 5% trail aggregate into gross and net R."""
        risk_pct = 0.10
        cands = [
            {"trail_r": {5: 1.4, 10: 0.8}, "risk_pct": risk_pct},
            {"trail_r": {5: -1.0, 10: -1.0}, "risk_pct": risk_pct},
            {"trail_r": {5: 0.5, 10: 0.5}, "risk_pct": risk_pct},
        ]
        b = bt._trailing_bucket(cands, 5)
        assert b["total"] == 3
        assert b["wins"] == 2
        assert b["win_rate"] == pytest.approx(66.7, abs=0.1)
        assert b["total_r"] == pytest.approx(0.9, abs=0.01)
        assert b["avg_r"] == pytest.approx(0.3, abs=0.01)
        # net: one round trip per candidate, expressed in R for this risk_pct.
        # Derived from bt.COST_PER_SIDE (like _COST_R_005) rather than a literal
        # so the expectation follows the configured cost.
        cost_r = 2 * bt.COST_PER_SIDE / risk_pct
        assert b["net_total_r"] == pytest.approx(0.9 - 3 * cost_r, abs=0.01)
        assert b["net_avg_r"] == pytest.approx(0.3 - cost_r, abs=0.01)
class TestTimeExits:
    """Exercise ``bt._time_exits``; results are read back by hold length in bars."""

    def test_long_exits_at_horizon_close(self):
        """Each hold exits at its horizon close, or the last close if bars run out."""
        rising = [_bar(103, 99, 102), _bar(105, 101, 104), _bar(107, 103, 106)]
        by_hold = bt._time_exits("long", 100.0, 95.0, rising, (2, 5))
        assert by_hold[2] == pytest.approx(0.8)  # close 104 → +4% / 5% risk
        assert by_hold[5] == pytest.approx(1.2)  # only 3 bars → last close 106

    def test_stop_on_first_bar_loses_everywhere(self):
        """A stop on the first bar is a full loss for every hold length."""
        stopped_early = [_bar(101, 94, 96), _bar(105, 101, 104)]
        by_hold = bt._time_exits("long", 100.0, 95.0, stopped_early, (1, 5))
        assert by_hold[1] == pytest.approx(-1.0)
        assert by_hold[5] == pytest.approx(-1.0)

    def test_stop_after_short_horizon_only_hits_long_hold(self):
        """A stop after the short horizon only affects the longer hold."""
        # Day-2 close banked by the 2-day hold; the stop on day 3 only hits n=5.
        late_stop = [_bar(103, 99, 102), _bar(104, 100, 103), _bar(101, 94, 95)]
        by_hold = bt._time_exits("long", 100.0, 95.0, late_stop, (2, 5))
        assert by_hold[2] == pytest.approx(0.6)  # close 103 → +3% / 5% risk
        assert by_hold[5] == pytest.approx(-1.0)

    def test_short_direction(self):
        """Shorts profit from a close below the entry."""
        falling = [_bar(101, 95, 96)]
        by_hold = bt._time_exits("short", 100.0, 105.0, falling, (1,))
        assert by_hold[1] == pytest.approx(0.8)  # close 96 → +4% / 5% risk

    def test_zero_risk_returns_zero(self):
        """Entry equal to stop (zero risk) yields exactly 0.0."""
        by_hold = bt._time_exits("long", 100.0, 100.0, [_bar(103, 99, 102)], (5,))
        assert by_hold[5] == 0.0
class TestTimeExitBucket:
    """Exercise ``bt._time_exit_bucket`` aggregation for one hold length."""

    def test_bucket(self):
        """Three candidates at the 5-day hold aggregate into gross and net R."""
        risk_pct = 0.10
        cands = [
            {"time_r": {5: 1.4, 21: 0.8}, "risk_pct": risk_pct},
            {"time_r": {5: -1.0, 21: -1.0}, "risk_pct": risk_pct},
            {"time_r": {5: 0.5, 21: 0.5}, "risk_pct": risk_pct},
        ]
        b = bt._time_exit_bucket(cands, 5)
        assert b["hold_days"] == 5
        assert b["total"] == 3
        assert b["wins"] == 2
        assert b["win_rate"] == pytest.approx(66.7, abs=0.1)
        assert b["avg_r"] == pytest.approx(0.3, abs=0.01)
        # net: one round trip per candidate, expressed in R for this risk_pct.
        # Derived from bt.COST_PER_SIDE (like _COST_R_005) rather than a literal
        # so the expectation follows the configured cost.
        cost_r = 2 * bt.COST_PER_SIDE / risk_pct
        assert b["net_avg_r"] == pytest.approx(0.3 - cost_r, abs=0.01)

    def test_missing_hold_skipped(self):
        """Candidates without an entry for the requested hold are not counted."""
        b = bt._time_exit_bucket([{"time_r": {5: 1.0}}], 21)
        assert b["total"] == 0
        assert b["avg_r"] is None
def _acand(
    rr: float = 2.0,
    conf: float = 60.0,
    action: str = "LONG_MODERATE",
    mp: float | None = 90.0,
    direction: str = "long",
) -> dict:
    """Build an ablation candidate that is always a resolved target hit.

    ``meets_core`` mirrors the default floors (min_rr 1.2, min_confidence 55,
    exclude_neutral on). Every hold length in ``bt.TIME_EXIT_DAYS`` is given a
    ``time_r`` of 0.5.
    """
    clears_floors = (
        rr >= 1.2
        and conf >= 55.0
        and action != "NEUTRAL"
    )
    hold_returns = dict.fromkeys(bt.TIME_EXIT_DAYS, 0.5)
    return dict(
        rr=rr,
        confidence=conf,
        action=action,
        momentum_percentile=mp,
        direction=direction,
        meets_core=clears_floors,
        risk_level="Low",
        target_hit=True,
        outcome=OUTCOME_TARGET_HIT,
        realized_r=rr,
        risk_pct=0.05,
        time_r=hold_returns,
    )
class TestGateAblation:
    """Exercise ``bt._gate_ablation`` against the activation floors below."""

    # Activation settings handed to _gate_ablation by every test in this class.
    ACTIVATION = {
        "min_rr": 1.2,
        "min_confidence": 55.0,
        "exclude_neutral": True,
        "require_high_conviction": False,
        "exclude_conflicts": False,
    }

    def _rows_by_variant(self, cands, threshold):
        """Run the ablation and index the resulting rows by their variant name."""
        ablation_rows = bt._gate_ablation(cands, self.ACTIVATION, threshold)
        return {row["variant"]: row for row in ablation_rows}

    def test_variant_counts(self):
        """Dropping one floor at a time admits exactly the matching candidate."""
        cands = [
            _acand(),  # clears everything
            _acand(conf=40.0),  # fails confidence floor
            _acand(rr=1.0),  # fails R:R floor
            _acand(action="NEUTRAL"),  # fails NEUTRAL exclusion
            _acand(mp=50.0),  # fails the momentum cutoff
            _acand(direction="short", mp=95.0),  # short — gated out
        ]
        rows = self._rows_by_variant(cands, 80.0)
        baseline = rows["all_floors"]

        assert baseline["total"] == 1
        assert rows["no_confidence_floor"]["total"] == 2
        assert rows["no_rr_floor"]["total"] == 2
        assert rows["no_neutral_exclusion"]["total"] == 2
        assert rows["momentum_only"]["total"] == 4
        assert baseline["net_avg_r"] is not None

        # Every variant is also graded under the hold-to-horizon exit.
        assert baseline["hold_days"] == max(bt.TIME_EXIT_DAYS)
        assert baseline["hold_avg_r"] == pytest.approx(0.5)
        assert baseline["hold_net_avg_r"] is not None
        assert rows["momentum_only"]["hold_total_r"] == pytest.approx(4 * 0.5, abs=0.01)

    def test_threshold_zero_disables_momentum_gate(self):
        """A zero momentum threshold leaves only the floors in force."""
        # Floors only: the short and the low-momentum long both pass all_floors.
        low_momentum_long = _acand(mp=50.0)
        short_without_momentum = _acand(direction="short", mp=None)
        rows = self._rows_by_variant([low_momentum_long, short_without_momentum], 0.0)
        assert rows["all_floors"]["total"] == 2
def test_bucket_stats_counts_and_expectancy():
    """Counts, hit rate and gross/net expectancy over a four-candidate mix."""
    two_wins = [
        _cand(70, OUTCOME_TARGET_HIT, 3.0),  # +3R win
        _cand(60, OUTCOME_TARGET_HIT, 2.0),  # +2R win
    ]
    loss = _cand(40, OUTCOME_STOP_HIT, 3.0)  # -1R loss
    expired = _cand(30, OUTCOME_EXPIRED, 3.0)  # 0R expired
    stats = bt._bucket_stats([*two_wins, loss, expired])

    assert stats["total"] == 4
    assert stats["wins"] == 2
    assert stats["losses"] == 1
    assert stats["expired"] == 1

    # hit rate is over decided (wins+losses) only
    decided_hit_rate = round(2 / 3 * 100, 1)
    assert stats["hit_rate"] == decided_hit_rate

    # avg R = (3 + 2 - 1 + 0) / 4 = 1.0
    assert stats["avg_r"] == 1.0
    assert stats["total_r"] == 4.0

    # net = gross minus a 0.04R round trip per candidate (risk_pct 0.05)
    assert stats["net_avg_r"] == pytest.approx(1.0 - _COST_R_005, abs=0.001)
    assert stats["net_total_r"] == pytest.approx(4.0 - 4 * _COST_R_005, abs=0.01)
def test_bucket_stats_empty():
    """An empty candidate list gives a zero total and ``None`` for the rates."""
    stats = bt._bucket_stats([])
    assert stats["total"] == 0
    # None of the ratio fields can be computed without candidates.
    assert stats["hit_rate"] is None
    assert stats["avg_r"] is None
    assert stats["net_avg_r"] is None
def test_bucket_stats_no_risk_pct_means_no_cost():
    """Without a ``risk_pct`` key the net figures equal the gross ones."""
    candidate = _cand(50, OUTCOME_TARGET_HIT, 2.0)
    del candidate["risk_pct"]
    stats = bt._bucket_stats([candidate])
    assert stats["net_avg_r"] == stats["avg_r"]
    assert stats["net_total_r"] == stats["total_r"]
def test_calibration_buckets():
    """Candidates land in their probability bucket with the realized hit rate."""
    high_prob_hit = _cand(65, OUTCOME_TARGET_HIT, 2.0)
    high_prob_stop = _cand(62, OUTCOME_STOP_HIT, 2.0)
    low_prob_stop = _cand(15, OUTCOME_STOP_HIT, 2.0)
    calibration_rows = bt._calibration([high_prob_hit, high_prob_stop, low_prob_stop])
    by_bucket = {row["bucket"]: row for row in calibration_rows}

    upper = by_bucket["60-80%"]
    assert upper["n"] == 2
    assert upper["realized_hit_rate"] == 50.0  # 1 of 2 hit

    lowest = by_bucket["0-20%"]
    assert lowest["n"] == 1
    assert lowest["realized_hit_rate"] == 0.0
def test_window_setups_too_short_returns_empty():
    """No bars in, no setups out."""
    setups = bt._window_setups([], {}, {})
    assert setups == []
def test_replay_ticker_candidates_carry_gate_fields():
    """The ablation recomputes floors from candidate fields — a candidate missing
    action/risk_level silently zeroes the ablation rows (July 2026 regression)."""
    from app.services.admin_service import ACTIVATION_DEFAULTS
    from app.services.recommendation_service import DEFAULT_RECOMMENDATION_CONFIG

    def oscillating_bar(day_index: int) -> SimpleNamespace:
        """One bar of a sine-wave price series centred on 100 with a fixed range."""
        price = 100.0 + 8.0 * math.sin(day_index / 6.0)
        return SimpleNamespace(
            date=start + timedelta(days=day_index),
            open=price,
            high=price + 1.5,
            low=price - 1.5,
            close=price,
            volume=1_000_000 + (day_index % 5) * 1000,
        )

    start = date(2025, 1, 1)
    bars = [oscillating_bar(i) for i in range(160)]

    # Pass copies so the replay cannot touch the shared default dicts.
    recommendation_config = dict(DEFAULT_RECOMMENDATION_CONFIG)
    activation = dict(ACTIVATION_DEFAULTS)
    cands = bt._replay_ticker("OSC", bars, recommendation_config, activation)

    assert cands, "expected the oscillating series to produce candidates"
    for candidate in cands:
        assert candidate.get("action") is not None
        assert "risk_level" in candidate
async def _seed_oscillating_ticker(session, symbol: str, n: int = 160) -> None:
    """Insert one ticker plus ``n`` daily bars of a sine-wave price series.

    The ticker is flushed first so its ``id`` is available for the bar rows;
    everything is committed at the end.
    """
    ticker = Ticker(symbol=symbol)
    session.add(ticker)
    await session.flush()

    first_day = date(2025, 1, 1)
    for offset in range(n):
        price = 100.0 + 8.0 * math.sin(offset / 6.0)
        record = OHLCVRecord(
            ticker_id=ticker.id,
            date=first_day + timedelta(days=offset),
            open=price,
            high=price + 1.5,
            low=price - 1.5,
            close=price,
            volume=1_000_000 + (offset % 5) * 1000,
        )
        session.add(record)
    await session.commit()
async def test_run_backtest_smoke(session):
    """End-to-end smoke test: one seeded ticker produces a well-formed report."""
    await _seed_oscillating_ticker(session, "OSC")
    report = await bt.run_backtest(session)

    # well-formed report
    assert report["tickers"] == 1
    assert isinstance(report["candidates"], int)
    required_sections = (
        "overall_qualified", "overall_all", "by_direction", "calibration", "sweep",
        "gate_ablation", "time_exit_sweep",
    )
    for section in required_sections:
        assert section in report

    # the oscillating series should yield at least some resolved setups
    assert report["candidates"] >= 1

    # cost assumption is reported, and every bucket carries net numbers
    reported_cost_pct = report["params"]["cost_per_side_pct"]
    assert reported_cost_pct == pytest.approx(bt.COST_PER_SIDE * 100)
    assert "net_avg_r" in report["overall_all"]

    # ablation baseline reproduces the qualified set exactly, and every row
    # carries the hold-to-horizon grading alongside the target model
    ablation_rows = report["gate_ablation"]
    ablation = {row["variant"]: row for row in ablation_rows}
    assert ablation["all_floors"]["total"] == report["overall_qualified"]["total"]
    for ablation_row in ablation_rows:
        assert "hold_net_avg_r" in ablation_row

    # time-exit sweep covers the configured hold lengths
    swept_holds = [row["hold_days"] for row in report["time_exit_sweep"]]
    assert swept_holds == list(bt.TIME_EXIT_DAYS)

    # sweep: lowering the momentum-percentile cutoff can only add qualifiers
    by_descending_cutoff = sorted(
        report["sweep"],
        key=lambda row: row["min_momentum_percentile"],
        reverse=True,
    )
    counts = [row["total"] for row in by_descending_cutoff]
    assert counts == sorted(counts)  # ascending as threshold descends

    # every calibration row is internally consistent
    for calibration_row in report["calibration"]:
        assert 0 <= calibration_row["realized_hit_rate"] <= 100
        assert calibration_row["n"] >= 1