Files
signal-platform/tests/unit/test_backtest_service.py
T
dennisthiessen 0f43e755f4
Deploy / lint (push) Successful in 6s
Deploy / test (push) Successful in 55s
Deploy / deploy (push) Successful in 38s
feat: portfolio simulation + per-trade stats (gaps, hold time, best/worst)
Per-trade additions to the report:
- Gap-through-stop fills: stops now fill at the worse of the stop or the
  bar's open across every exit model (target, TP, trailing, time), so a
  loss can exceed -1R; targets never fill better than their level.
- best_r / worst_r, avg holding days, and net R per day of capital
  deployed on the summary buckets and the time-exit sweep.

Portfolio simulation (the stats a per-setup replay cannot give):
- One capital-constrained book over the qualified setups: 10k start, max
  10 concurrent positions (one per ticker, best momentum first), 1%
  fixed-fractional risk with a 20% no-leverage notional cap, entries at
  the detection close, 0.1%/side costs, daily mark-to-market.
- Two exit policies compared: S/R target race vs hold-to-horizon.
- Equity-curve stats: final equity, total return, CAGR, max drawdown,
  annualized daily Sharpe, win rate, avg P&L, best/worst trade, avg
  hold, entries skipped on a full book, and SPY price return over the
  same window (benchmark history refreshed to cover the replay span).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-07-02 11:56:29 +02:00

533 lines
21 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Tests for the historical backtest harness."""
from __future__ import annotations
import math
from datetime import date, timedelta
from types import SimpleNamespace
import pytest
from app.models.ohlcv import OHLCVRecord
from app.models.ticker import Ticker
from app.services import backtest_service as bt
from app.services.outcome_service import (
OUTCOME_EXPIRED,
OUTCOME_STOP_HIT,
OUTCOME_TARGET_HIT,
)
from tests.conftest import _test_session_factory # type: ignore
@pytest.fixture
async def session():
    """Yield a database session opened from the shared test session factory.

    The session is obtained through ``async with``, so the factory's context
    manager is exited once the requesting test has finished.
    """
    async with _test_session_factory() as db_session:
        yield db_session
def _cand(
    prob: float,
    outcome: str,
    rr: float,
    qualified: bool = True,
    direction: str = "long",
    risk_pct: float = 0.05,
    hold_days: int = 10,
) -> dict:
    """Build a synthetic resolved candidate dict for the bucket tests.

    The realized R is derived from ``outcome``: a target hit banks the full
    ``rr``, an expiry is flat (0R) and any other outcome is a -1R loss.
    """
    is_win = outcome == OUTCOME_TARGET_HIT
    if is_win:
        realized_r = rr
    elif outcome == OUTCOME_EXPIRED:
        realized_r = 0.0
    else:
        realized_r = -1.0
    return dict(
        primary_prob=prob,
        outcome=outcome,
        target_hit=is_win,
        rr=rr,
        realized_r=realized_r,
        qualified=qualified,
        direction=direction,
        risk_pct=risk_pct,
        hold_days=hold_days,
    )
# Round-trip cost in R for the default _cand risk_pct: 2 * 0.001 / 0.05 = 0.04R.
# Two sides (entry + exit) at bt.COST_PER_SIDE, divided by the 0.05 risk
# fraction; the 0.04R figure assumes COST_PER_SIDE is 0.001 — confirm against
# backtest_service if that constant changes.
_COST_R_005 = 2 * bt.COST_PER_SIDE / 0.05
def _bar(high: float, low: float, close: float, open_: float | None = None) -> SimpleNamespace:
"""Synthetic daily bar. ``open`` defaults to the high so a stop is pierced
intraday (fill at the stop level); pass an explicit open beyond the stop to
model a gap through it."""
return SimpleNamespace(
high=high, low=low, close=close, open=open_ if open_ is not None else high
)
class TestStopFillR:
    """Realized R returned by ``bt._stop_fill_r`` for a stopped-out trade."""

    def test_intraday_fill_at_stop(self):
        # The default open (= high) is above the 95 stop, so the stop is
        # pierced intraday and the fill is the stop level itself: -1R.
        bar = _bar(101, 94, 96)
        fill_r = bt._stop_fill_r("long", 100.0, 95.0, bar)
        assert fill_r == pytest.approx(-1.0)

    def test_gap_fill_at_open(self):
        # Opens at 92, below the 95 stop → filled at the open, worse than 1R.
        gapped = _bar(93, 90, 91, open_=92)
        fill_r = bt._stop_fill_r("long", 100.0, 95.0, gapped)
        assert fill_r == pytest.approx(-1.6)

    def test_short_gap_fill_at_open(self):
        # Short stop 105; opens at 107 above it → fill 107.
        gapped = _bar(110, 104, 108, open_=107)
        fill_r = bt._stop_fill_r("short", 100.0, 105.0, gapped)
        assert fill_r == pytest.approx(-1.4)
class TestTakeProfitPrimitives:
    """Checks the 6-tuple returned by ``bt._tp_primitives``.

    As unpacked by these tests the fields are: risk, stopped flag, MFE,
    close percentage, stop day and stop R.
    """

    def test_long_tp_reachable_before_stop(self):
        bars = [_bar(109, 101, 108)]
        risk_pct, was_stopped, mfe_pct, close_ret, stopped_on, _ = bt._tp_primitives(
            "long", 100.0, 95.0, bars, 30
        )
        assert risk_pct == pytest.approx(0.05)
        assert was_stopped is False
        assert mfe_pct == pytest.approx(0.09)
        assert close_ret == pytest.approx(0.08)
        assert stopped_on is None

    def test_long_stop_zeroes_mfe(self):
        # Low pierces the stop on the only bar → loss, nothing banked before it.
        bars = [_bar(101, 94, 96)]
        _, was_stopped, mfe_pct, close_ret, stopped_on, stop_fill_r = bt._tp_primitives(
            "long", 100.0, 95.0, bars, 30
        )
        assert was_stopped is True
        assert mfe_pct == pytest.approx(0.0)
        assert close_ret == pytest.approx(-0.04)
        assert stopped_on == 1
        assert stop_fill_r == pytest.approx(-1.0)

    def test_gap_through_stop_loses_more_than_1r(self):
        gapped_bars = [_bar(93, 90, 91, open_=92)]
        _, was_stopped, _, _, stopped_on, stop_fill_r = bt._tp_primitives(
            "long", 100.0, 95.0, gapped_bars, 30
        )
        assert was_stopped is True
        assert stopped_on == 1
        # Filled at the 92 open, not the 95 stop.
        assert stop_fill_r == pytest.approx(-1.6)

    def test_long_drift_no_trigger(self):
        drifting = [_bar(102, 99, 101), _bar(103, 100, 102)]
        _, was_stopped, mfe_pct, close_ret, _, _ = bt._tp_primitives(
            "long", 100.0, 95.0, drifting, 30
        )
        assert was_stopped is False
        assert mfe_pct == pytest.approx(0.03)
        assert close_ret == pytest.approx(0.02)

    def test_short_direction(self):
        # Short entry 100, stop 105; price falls → favourable = (entry - low)/entry.
        falling = [_bar(101, 92, 93)]
        risk_pct, was_stopped, mfe_pct, close_ret, _, _ = bt._tp_primitives(
            "short", 100.0, 105.0, falling, 30
        )
        assert risk_pct == pytest.approx(0.05)
        assert was_stopped is False
        assert mfe_pct == pytest.approx(0.08)
        assert close_ret == pytest.approx(0.07)
class TestTakeProfitBucket:
    """Aggregation of take-profit results by ``bt._take_profit_bucket``."""

    def test_bucket_mix(self):
        winner = {"risk_pct": 0.05, "mfe_pct": 0.09, "tp_stopped": False, "tp_close_pct": 0.08}  # +1.6R win
        stopped = {"risk_pct": 0.05, "mfe_pct": 0.02, "tp_stopped": True, "tp_close_pct": -0.04}  # -1R stop
        timed_out = {"risk_pct": 0.05, "mfe_pct": 0.03, "tp_stopped": False, "tp_close_pct": 0.01}  # +0.2R timeout
        bucket = bt._take_profit_bucket([winner, stopped, timed_out], 0.08)

        assert bucket["total"] == 3
        assert bucket["wins"] == 1
        assert bucket["hit_rate"] == pytest.approx(33.3, abs=0.1)
        assert bucket["total_r"] == pytest.approx(0.8, abs=0.01)
        assert bucket["avg_r"] == pytest.approx(0.267, abs=0.01)

        # Net: minus a 0.04R round trip per candidate (risk_pct 0.05).
        expected_net_total = 0.8 - 3 * _COST_R_005
        assert bucket["net_total_r"] == pytest.approx(expected_net_total, abs=0.01)
        assert bucket["net_avg_r"] == pytest.approx(expected_net_total / 3, abs=0.01)

    def test_zero_risk_skipped(self):
        # A candidate with zero risk is expected to be left out of the bucket.
        zero_risk = {"risk_pct": 0.0, "mfe_pct": 0.2, "tp_stopped": False, "tp_close_pct": 0.1}
        bucket = bt._take_profit_bucket([zero_risk], 0.08)
        assert bucket["total"] == 0
        assert bucket["avg_r"] is None
class TestTrailingExits:
    """``bt._trailing_exits``: realized R per trail width.

    The result is indexed by the trail width in whole percent (10, 20, 5) in
    these tests.
    """

    def test_locks_gain_on_pullback(self):
        # Runs to 120, then a 10% trail (from peak 120 → 108) is pierced on the drop.
        run_then_drop = [_bar(120, 110, 118), _bar(130, 100, 105)]
        by_width = bt._trailing_exits("long", 100.0, 90.0, (0.10,), run_then_drop, 30)
        # (108 - 100) / 100 over a 10% risk.
        assert by_width[10] == pytest.approx(0.8)

    def test_initial_stop_caps_loss(self):
        # Trail (20%) is looser than the initial stop → initial stop governs = -1R.
        bars = [_bar(101, 89, 90)]
        by_width = bt._trailing_exits("long", 100.0, 90.0, (0.20,), bars, 30)
        assert by_width[20] == pytest.approx(-1.0)

    def test_timeout_exits_at_close(self):
        quiet = [_bar(105, 98, 104), _bar(106, 100, 105)]
        by_width = bt._trailing_exits("long", 100.0, 90.0, (0.20,), quiet, 30)
        # Close 105 → +5% over a 10% risk.
        assert by_width[20] == pytest.approx(0.5)

    def test_multiple_widths_one_pass(self):
        # Tighter trail locks in more here (exit at 114 vs 108).
        run_then_drop = [_bar(120, 110, 118), _bar(130, 100, 105)]
        by_width = bt._trailing_exits("long", 100.0, 90.0, (0.10, 0.05), run_then_drop, 30)
        assert by_width[10] == pytest.approx(0.8)
        assert by_width[5] == pytest.approx(1.4)

    def test_gap_through_stop_fills_at_open(self):
        # Initial stop 90 governs (20% trail from peak 100 is lower); the bar
        # opens at 85, below it → fill at the open.
        gapped = [_bar(88, 84, 86, open_=85)]
        by_width = bt._trailing_exits("long", 100.0, 90.0, (0.20,), gapped, 30)
        assert by_width[20] == pytest.approx(-1.5)
class TestTrailingBucket:
    """Aggregation of trailing-exit results by ``bt._trailing_bucket``."""

    def test_bucket(self):
        per_candidate_r = (
            {5: 1.4, 10: 0.8},
            {5: -1.0, 10: -1.0},
            {5: 0.5, 10: 0.5},
        )
        cands = [{"trail_r": trail_r, "risk_pct": 0.10} for trail_r in per_candidate_r]
        bucket = bt._trailing_bucket(cands, 5)

        assert bucket["total"] == 3
        assert bucket["wins"] == 2
        assert bucket["win_rate"] == pytest.approx(66.7, abs=0.1)
        assert bucket["total_r"] == pytest.approx(0.9, abs=0.01)
        assert bucket["avg_r"] == pytest.approx(0.3, abs=0.01)
        # Net: 0.02R round trip per candidate (risk_pct 0.10).
        assert bucket["net_total_r"] == pytest.approx(0.9 - 3 * 0.02, abs=0.01)
        assert bucket["net_avg_r"] == pytest.approx(0.28, abs=0.01)
class TestTimeExits:
    """``bt._time_exits``: realized R per hold length (keyed by days held)."""

    def test_long_exits_at_horizon_close(self):
        rising = [_bar(103, 99, 102), _bar(105, 101, 104), _bar(107, 103, 106)]
        by_hold = bt._time_exits("long", 100.0, 95.0, rising, (2, 5))
        # Close 104 → +4% over a 5% risk.
        assert by_hold[2] == pytest.approx(0.8)
        # Only 3 bars available → exit at the last close, 106.
        assert by_hold[5] == pytest.approx(1.2)

    def test_stop_on_first_bar_loses_everywhere(self):
        stopped_day_one = [_bar(101, 94, 96), _bar(105, 101, 104)]
        by_hold = bt._time_exits("long", 100.0, 95.0, stopped_day_one, (1, 5))
        assert by_hold[1] == pytest.approx(-1.0)
        assert by_hold[5] == pytest.approx(-1.0)

    def test_stop_after_short_horizon_only_hits_long_hold(self):
        # Day-2 close banked by the 2-day hold; the stop on day 3 only hits n=5.
        late_stop = [_bar(103, 99, 102), _bar(104, 100, 103), _bar(101, 94, 95)]
        by_hold = bt._time_exits("long", 100.0, 95.0, late_stop, (2, 5))
        # Close 103 → +3% over a 5% risk.
        assert by_hold[2] == pytest.approx(0.6)
        assert by_hold[5] == pytest.approx(-1.0)

    def test_short_direction(self):
        falling = [_bar(101, 95, 96)]
        by_hold = bt._time_exits("short", 100.0, 105.0, falling, (1,))
        # Close 96 → +4% over a 5% risk.
        assert by_hold[1] == pytest.approx(0.8)

    def test_zero_risk_returns_zero(self):
        # Stop equal to entry means zero risk; the result is expected to be 0R.
        bars = [_bar(103, 99, 102)]
        by_hold = bt._time_exits("long", 100.0, 100.0, bars, (5,))
        assert by_hold[5] == 0.0

    def test_gap_through_stop_fills_at_open(self):
        gapped = [_bar(93, 90, 91, open_=92)]
        by_hold = bt._time_exits("long", 100.0, 95.0, gapped, (5,))
        assert by_hold[5] == pytest.approx(-1.6)
class TestTimeExitBucket:
    """Aggregation of time-exit results by ``bt._time_exit_bucket``."""

    def test_bucket(self):
        per_candidate_r = (
            {5: 1.4, 21: 0.8},
            {5: -1.0, 21: -1.0},
            {5: 0.5, 21: 0.5},
        )
        cands = [{"time_r": time_r, "risk_pct": 0.10} for time_r in per_candidate_r]
        bucket = bt._time_exit_bucket(cands, 5)

        assert bucket["hold_days"] == 5
        assert bucket["total"] == 3
        assert bucket["wins"] == 2
        assert bucket["win_rate"] == pytest.approx(66.7, abs=0.1)
        assert bucket["avg_r"] == pytest.approx(0.3, abs=0.01)
        assert bucket["net_avg_r"] == pytest.approx(0.28, abs=0.01)
        assert bucket["best_r"] == pytest.approx(1.4)
        assert bucket["worst_r"] == pytest.approx(-1.0)
        # No stop_day on any candidate → every hold runs the full 5 days.
        assert bucket["avg_hold_days"] == 5.0
        assert bucket["net_r_per_day"] == pytest.approx(0.28 / 5.0, abs=0.001)

    def test_missing_hold_skipped(self):
        # The only candidate has no entry for a 21-day hold, so nothing is counted.
        only_five_day = [{"time_r": {5: 1.0}}]
        bucket = bt._time_exit_bucket(only_five_day, 21)
        assert bucket["total"] == 0
        assert bucket["avg_r"] is None
def _acand(
    rr: float = 2.0,
    conf: float = 60.0,
    action: str = "LONG_MODERATE",
    mp: float | None = 90.0,
    direction: str = "long",
) -> dict:
    """Build an ablation candidate that always resolved as a target hit.

    ``meets_core`` mirrors the default floors (min_rr 1.2, min_confidence 55,
    exclude_neutral on). Every configured time-exit hold is graded at +0.5R.
    """
    clears_floors = all((rr >= 1.2, conf >= 55.0, action != "NEUTRAL"))
    return dict(
        rr=rr,
        confidence=conf,
        action=action,
        momentum_percentile=mp,
        direction=direction,
        meets_core=clears_floors,
        risk_level="Low",
        target_hit=True,
        outcome=OUTCOME_TARGET_HIT,
        realized_r=rr,
        risk_pct=0.05,
        time_r=dict.fromkeys(bt.TIME_EXIT_DAYS, 0.5),
    )
class TestGateAblation:
    """Variant rows produced by ``bt._gate_ablation``."""

    # Activation settings matching the floors that ``_acand`` mirrors.
    ACTIVATION = {
        "min_rr": 1.2,
        "min_confidence": 55.0,
        "exclude_neutral": True,
        "require_high_conviction": False,
        "exclude_conflicts": False,
    }

    def test_variant_counts(self):
        clears_everything = _acand()
        low_confidence = _acand(conf=40.0)  # fails confidence floor
        low_rr = _acand(rr=1.0)  # fails R:R floor
        neutral = _acand(action="NEUTRAL")  # fails NEUTRAL exclusion
        low_momentum = _acand(mp=50.0)  # fails the momentum cutoff
        short_side = _acand(direction="short", mp=95.0)  # short — gated out
        cands = [clears_everything, low_confidence, low_rr, neutral, low_momentum, short_side]

        by_variant = {}
        for row in bt._gate_ablation(cands, self.ACTIVATION, 80.0):
            by_variant[row["variant"]] = row

        baseline = by_variant["all_floors"]
        assert baseline["total"] == 1
        assert by_variant["no_confidence_floor"]["total"] == 2
        assert by_variant["no_rr_floor"]["total"] == 2
        assert by_variant["no_neutral_exclusion"]["total"] == 2
        assert by_variant["momentum_only"]["total"] == 4
        assert baseline["net_avg_r"] is not None
        # Every variant is also graded under the hold-to-horizon exit.
        assert baseline["hold_days"] == max(bt.TIME_EXIT_DAYS)
        assert baseline["hold_avg_r"] == pytest.approx(0.5)
        assert baseline["hold_net_avg_r"] is not None
        assert by_variant["momentum_only"]["hold_total_r"] == pytest.approx(4 * 0.5, abs=0.01)

    def test_threshold_zero_disables_momentum_gate(self):
        # Floors only: the short and the low-momentum long both pass all_floors.
        low_momentum_long = _acand(mp=50.0)
        short_without_momentum = _acand(direction="short", mp=None)
        cands = [low_momentum_long, short_without_momentum]

        by_variant = {}
        for row in bt._gate_ablation(cands, self.ACTIVATION, 0.0):
            by_variant[row["variant"]] = row

        assert by_variant["all_floors"]["total"] == 2
def _sim_prices(start_ord: int, closes: list[float]) -> tuple:
"""Column arrays for consecutive daily bars: open = close (no gaps),
high/low = close ± 1."""
ords = list(range(start_ord, start_ord + len(closes)))
return (
ords,
list(closes),
[c + 1.0 for c in closes],
[c - 1.0 for c in closes],
list(closes),
[1_000_000] * len(closes),
)
def _sim_cand(
sym: str, day_ord: int, entry: float, stop: float, target: float, mp: float = 90.0
) -> dict:
return {
"qualified": True,
"direction": "long",
"symbol": sym,
"date": date.fromordinal(day_ord).isoformat(),
"entry": entry,
"stop": stop,
"target": target,
"momentum_percentile": mp,
}
class TestSimulatePortfolio:
    """Single-ticker scenarios for ``bt._simulate_portfolio``."""

    # Ordinal of the first simulated bar (2025-01-06).
    ORD = date(2025, 1, 6).toordinal()

    def test_hold_policy_accounting(self):
        rising = [100.0, 102.0, 104.0, 106.0, 108.0, 110.0]
        price_table = {"AAA": _sim_prices(self.ORD, rising)}
        setup = _sim_cand("AAA", self.ORD, entry=100.0, stop=95.0, target=130.0)

        result = bt._simulate_portfolio([setup], price_table, None, "hold", 3)

        assert result is not None
        assert result["trades"] == 1
        # 20 shares (1% risk / $5 stop distance), exit at the day-3 close 106:
        # pnl = 2120 - 2000 - 2.00 entry cost - 2.12 exit cost = 115.88
        assert result["final_equity"] == pytest.approx(10_115.88, abs=0.01)
        assert result["win_rate"] == 100.0
        assert result["best_trade_r"] == pytest.approx(1.2)
        assert result["avg_hold_days"] == 3.0
        assert result["max_drawdown_pct"] == 0.0
        # Window far too short to annualize.
        assert result["cagr_pct"] is None
        assert result["spy_return_pct"] is None

    def test_target_policy_exits_at_target(self):
        rising = [100.0, 102.0, 104.0, 106.0, 108.0, 110.0]
        price_table = {"AAA": _sim_prices(self.ORD, rising)}
        setup = _sim_cand("AAA", self.ORD, entry=100.0, stop=95.0, target=105.0)

        result = bt._simulate_portfolio([setup], price_table, None, "target", 30)

        assert result is not None
        assert result["trades"] == 1
        # Filled exactly at 105.
        assert result["best_trade_r"] == pytest.approx(1.0)

    def test_stop_gap_fills_at_open(self):
        # Day-1 bar gaps to a 90 open, below the 95 stop → fill at the open.
        day_ords = list(range(self.ORD, self.ORD + 2))
        gapped_columns = (
            day_ords,
            [100.0, 90.0],
            [101.0, 92.0],
            [99.0, 88.0],
            [100.0, 91.0],
            [1, 1],
        )
        price_table = {"AAA": gapped_columns}
        setup = _sim_cand("AAA", self.ORD, entry=100.0, stop=95.0, target=120.0)

        result = bt._simulate_portfolio([setup], price_table, None, "hold", 30)

        assert result is not None
        assert result["trades"] == 1
        # (90 - 100) / 5
        assert result["worst_trade_r"] == pytest.approx(-2.0)

    def test_nothing_qualified_returns_none(self):
        result = bt._simulate_portfolio([], {}, None, "hold", 30)
        assert result is None
def test_bucket_stats_counts_and_expectancy():
    """``bt._bucket_stats`` over two wins, one loss and one expiry."""
    two_r_three_r_wins = [
        _cand(70, OUTCOME_TARGET_HIT, 3.0),  # +3R win
        _cand(60, OUTCOME_TARGET_HIT, 2.0),  # +2R win
    ]
    loss = _cand(40, OUTCOME_STOP_HIT, 3.0)  # -1R loss
    expiry = _cand(30, OUTCOME_EXPIRED, 3.0)  # 0R expired
    stats = bt._bucket_stats([*two_r_three_r_wins, loss, expiry])

    assert stats["total"] == 4
    assert stats["wins"] == 2
    assert stats["losses"] == 1
    assert stats["expired"] == 1
    # Hit rate is over decided (wins + losses) only.
    assert stats["hit_rate"] == round(2 / 3 * 100, 1)
    # avg R = (3 + 2 - 1 + 0) / 4 = 1.0
    assert stats["avg_r"] == 1.0
    assert stats["total_r"] == 4.0
    # Net = gross minus a 0.04R round trip per candidate (risk_pct 0.05).
    assert stats["net_avg_r"] == pytest.approx(1.0 - _COST_R_005, abs=0.001)
    assert stats["net_total_r"] == pytest.approx(4.0 - 4 * _COST_R_005, abs=0.01)
    assert stats["best_r"] == 3.0
    assert stats["worst_r"] == -1.0
    # Every _cand defaults to hold_days=10.
    assert stats["avg_hold_days"] == 10.0
    assert stats["net_r_per_day"] == pytest.approx((1.0 - _COST_R_005) / 10.0, abs=0.001)
def test_bucket_stats_empty():
    """An empty candidate list gives a zero total and ``None`` for the ratios."""
    stats = bt._bucket_stats([])
    assert stats["total"] == 0
    assert stats["hit_rate"] is None
    assert stats["avg_r"] is None
    assert stats["net_avg_r"] is None
def test_bucket_stats_no_risk_pct_means_no_cost():
    """Without a ``risk_pct`` key the net figures equal the gross figures."""
    candidate = _cand(50, OUTCOME_TARGET_HIT, 2.0)
    candidate.pop("risk_pct")
    stats = bt._bucket_stats([candidate])
    assert stats["net_avg_r"] == stats["avg_r"]
    assert stats["net_total_r"] == stats["total_r"]
def test_calibration_buckets():
    """``bt._calibration`` groups candidates into probability buckets."""
    cands = [
        _cand(65, OUTCOME_TARGET_HIT, 2.0),
        _cand(62, OUTCOME_STOP_HIT, 2.0),
        _cand(15, OUTCOME_STOP_HIT, 2.0),
    ]
    rows_by_bucket = {}
    for row in bt._calibration(cands):
        rows_by_bucket[row["bucket"]] = row

    upper = rows_by_bucket["60-80%"]
    assert upper["n"] == 2
    # 1 of 2 hit.
    assert upper["realized_hit_rate"] == 50.0

    lowest = rows_by_bucket["0-20%"]
    assert lowest["n"] == 1
    assert lowest["realized_hit_rate"] == 0.0
def test_window_setups_too_short_returns_empty():
    """With no bars at all, ``bt._window_setups`` returns an empty list."""
    setups = bt._window_setups([], {}, {})
    assert setups == []
def test_replay_ticker_candidates_carry_gate_fields():
    """Replayed candidates must expose the fields the ablation reads.

    The ablation recomputes floors from candidate fields — a candidate missing
    action/risk_level silently zeroes the ablation rows (July 2026 regression).
    """
    from app.services.admin_service import ACTIVATION_DEFAULTS
    from app.services.recommendation_service import DEFAULT_RECOMMENDATION_CONFIG

    first_day = date(2025, 1, 1)
    # 160 daily bars on a sine wave around 100 with a flat open = close.
    sine_closes = [100.0 + 8.0 * math.sin(i / 6.0) for i in range(160)]
    bars = [
        SimpleNamespace(
            date=first_day + timedelta(days=i),
            open=px,
            high=px + 1.5,
            low=px - 1.5,
            close=px,
            volume=1_000_000 + (i % 5) * 1000,
        )
        for i, px in enumerate(sine_closes)
    ]

    # Copies are passed so the shared default dicts cannot be mutated.
    config = dict(DEFAULT_RECOMMENDATION_CONFIG)
    activation = dict(ACTIVATION_DEFAULTS)
    cands = bt._replay_ticker("OSC", bars, config, activation)

    assert cands, "expected the oscillating series to produce candidates"
    for candidate in cands:
        assert candidate.get("action") is not None
        assert "risk_level" in candidate
async def _seed_oscillating_ticker(session, symbol: str, n: int = 160) -> None:
    """Insert a ticker and ``n`` daily OHLCV rows on a sine wave, then commit.

    Closes oscillate around 100 (amplitude 8); each bar has open = close and
    high/low = close ± 1.5, starting on 2025-01-01.
    """
    ticker = Ticker(symbol=symbol)
    session.add(ticker)
    # Flush before the rows reference ticker.id (presumably so the primary
    # key is populated — confirm against the model definition).
    await session.flush()

    first_day = date(2025, 1, 1)
    for offset in range(n):
        px = 100.0 + 8.0 * math.sin(offset / 6.0)
        record = OHLCVRecord(
            ticker_id=ticker.id,
            date=first_day + timedelta(days=offset),
            open=px,
            high=px + 1.5,
            low=px - 1.5,
            close=px,
            volume=1_000_000 + (offset % 5) * 1000,
        )
        session.add(record)
    await session.commit()
async def test_run_backtest_smoke(session):
    """End-to-end smoke test of ``bt.run_backtest`` on one seeded ticker."""
    await _seed_oscillating_ticker(session, "OSC")
    report = await bt.run_backtest(session)

    # Well-formed report.
    assert report["tickers"] == 1
    assert isinstance(report["candidates"], int)
    required_sections = (
        "overall_qualified", "overall_all", "by_direction", "calibration", "sweep",
        "gate_ablation", "time_exit_sweep",
    )
    for section in required_sections:
        assert section in report

    # The oscillating series should yield at least some resolved setups.
    assert report["candidates"] >= 1

    # Cost assumption is reported, and every bucket carries net numbers.
    assert report["params"]["cost_per_side_pct"] == pytest.approx(bt.COST_PER_SIDE * 100)
    assert "net_avg_r" in report["overall_all"]

    # Ablation baseline reproduces the qualified set exactly, and every row
    # carries the hold-to-horizon grading alongside the target model.
    ablation_rows = report["gate_ablation"]
    ablation_by_variant = {row["variant"]: row for row in ablation_rows}
    assert ablation_by_variant["all_floors"]["total"] == report["overall_qualified"]["total"]
    for row in ablation_rows:
        assert "hold_net_avg_r" in row

    # Time-exit sweep covers the configured hold lengths.
    swept_holds = [row["hold_days"] for row in report["time_exit_sweep"]]
    assert swept_holds == list(bt.TIME_EXIT_DAYS)

    # Portfolio simulation section is always present (policies may be empty
    # when nothing qualifies).
    assert "portfolio_sim" in report
    portfolio = report["portfolio_sim"]
    assert isinstance(portfolio["policies"], list)
    assert portfolio["params"]["max_positions"] == bt.SIM_MAX_POSITIONS

    # Sweep: lowering the momentum-percentile cutoff can only add qualifiers.
    by_cutoff_desc = sorted(
        report["sweep"], key=lambda r: r["min_momentum_percentile"], reverse=True
    )
    totals = [row["total"] for row in by_cutoff_desc]
    # Ascending as the threshold descends.
    assert totals == sorted(totals)

    # Every calibration row is internally consistent.
    for row in report["calibration"]:
        assert 0 <= row["realized_hit_rate"] <= 100
        assert row["n"] >= 1