Files
signal-platform/tests/unit/test_backtest_service.py
T
dennisthiessen 243e369e9a
Deploy / lint (push) Successful in 6s
Deploy / test (push) Successful in 54s
Deploy / deploy (push) Successful in 32s
feat: robustness stats + dynamic recommendation; retire settled report sections
Robustness (answers 'is the edge just outliers?'):
- _bucket_stats gains median_net_r, profit_factor, and net_avg_r_ex_top5
  (expectancy with the top 5% of winners removed); shown as stat tiles.
- Portfolio sim gains per-calendar-year returns, shown in the sim table.

Dynamic recommendation ('What this backtest recommends' panel):
- _build_recommendation derives advice from the report's own numbers on
  every run — exit policy (target vs best hold, with sim CAGRs), which
  gate floors earn their keep (ablation Hold column), best momentum
  cutoff, book-vs-SPY verdict, and an outlier-dependence warning when
  the trimmed expectancy goes non-positive.

Retired (conclusions reached, tables removed from report + UI):
- Take-profit sweep (no interior optimum — fixed TP is the wrong tool
  for momentum), trailing sweep (converged to the hold-to-horizon exit),
  probability calibration (model is display-only by decision).
- _tp_primitives slimmed to _risk_and_stop_day; trailing machinery gone.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-07-02 12:33:22 +02:00

477 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Tests for the historical backtest harness."""
from __future__ import annotations
import math
from datetime import date, timedelta
from types import SimpleNamespace
import pytest
from app.models.ohlcv import OHLCVRecord
from app.models.ticker import Ticker
from app.services import backtest_service as bt
from app.services.outcome_service import (
OUTCOME_EXPIRED,
OUTCOME_STOP_HIT,
OUTCOME_TARGET_HIT,
)
from tests.conftest import _test_session_factory # type: ignore
@pytest.fixture
async def session():
async with _test_session_factory() as s:
yield s
def _cand(
prob: float,
outcome: str,
rr: float,
qualified: bool = True,
direction: str = "long",
risk_pct: float = 0.05,
hold_days: int = 10,
) -> dict:
target_hit = outcome == OUTCOME_TARGET_HIT
realized = rr if target_hit else (0.0 if outcome == OUTCOME_EXPIRED else -1.0)
return {
"primary_prob": prob,
"outcome": outcome,
"target_hit": target_hit,
"rr": rr,
"realized_r": realized,
"qualified": qualified,
"direction": direction,
"risk_pct": risk_pct,
"hold_days": hold_days,
}
# Round-trip cost in R for the default _cand risk_pct: 2 * 0.001 / 0.05 = 0.04R.
_COST_R_005 = 2 * bt.COST_PER_SIDE / 0.05
def _bar(high: float, low: float, close: float, open_: float | None = None) -> SimpleNamespace:
"""Synthetic daily bar. ``open`` defaults to the high so a stop is pierced
intraday (fill at the stop level); pass an explicit open beyond the stop to
model a gap through it."""
return SimpleNamespace(
high=high, low=low, close=close, open=open_ if open_ is not None else high
)
class TestStopFillR:
def test_intraday_fill_at_stop(self):
assert bt._stop_fill_r("long", 100.0, 95.0, _bar(101, 94, 96)) == pytest.approx(-1.0)
def test_gap_fill_at_open(self):
# Opens at 92, below the 95 stop → filled at the open, worse than 1R.
assert bt._stop_fill_r("long", 100.0, 95.0, _bar(93, 90, 91, open_=92)) == pytest.approx(-1.6)
def test_short_gap_fill_at_open(self):
# Short stop 105; opens at 107 above it → fill 107.
assert bt._stop_fill_r("short", 100.0, 105.0, _bar(110, 104, 108, open_=107)) == pytest.approx(-1.4)
class TestRiskAndStopDay:
def test_no_stop(self):
risk, stop_day = bt._risk_and_stop_day("long", 100.0, 95.0, [_bar(109, 101, 108)], 30)
assert risk == pytest.approx(0.05)
assert stop_day is None
def test_stop_day_is_one_based(self):
bars = [_bar(102, 99, 101), _bar(101, 94, 96)]
risk, stop_day = bt._risk_and_stop_day("long", 100.0, 95.0, bars, 30)
assert risk == pytest.approx(0.05)
assert stop_day == 2
def test_short_direction(self):
_, stop_day = bt._risk_and_stop_day("short", 100.0, 105.0, [_bar(106, 101, 104)], 30)
assert stop_day == 1
class TestTimeExits:
def test_long_exits_at_horizon_close(self):
bars = [_bar(103, 99, 102), _bar(105, 101, 104), _bar(107, 103, 106)]
res = bt._time_exits("long", 100.0, 95.0, bars, (2, 5))
assert res[2] == pytest.approx(0.8) # close 104 → +4% / 5% risk
assert res[5] == pytest.approx(1.2) # only 3 bars → last close 106
def test_stop_on_first_bar_loses_everywhere(self):
res = bt._time_exits("long", 100.0, 95.0, [_bar(101, 94, 96), _bar(105, 101, 104)], (1, 5))
assert res[1] == pytest.approx(-1.0)
assert res[5] == pytest.approx(-1.0)
def test_stop_after_short_horizon_only_hits_long_hold(self):
# Day-2 close banked by the 2-day hold; the stop on day 3 only hits n=5.
bars = [_bar(103, 99, 102), _bar(104, 100, 103), _bar(101, 94, 95)]
res = bt._time_exits("long", 100.0, 95.0, bars, (2, 5))
assert res[2] == pytest.approx(0.6) # close 103 → +3% / 5% risk
assert res[5] == pytest.approx(-1.0)
def test_short_direction(self):
res = bt._time_exits("short", 100.0, 105.0, [_bar(101, 95, 96)], (1,))
assert res[1] == pytest.approx(0.8) # close 96 → +4% / 5% risk
def test_zero_risk_returns_zero(self):
res = bt._time_exits("long", 100.0, 100.0, [_bar(103, 99, 102)], (5,))
assert res[5] == 0.0
def test_gap_through_stop_fills_at_open(self):
res = bt._time_exits("long", 100.0, 95.0, [_bar(93, 90, 91, open_=92)], (5,))
assert res[5] == pytest.approx(-1.6)
class TestTimeExitBucket:
def test_bucket(self):
cands = [
{"time_r": {5: 1.4, 21: 0.8}, "risk_pct": 0.10},
{"time_r": {5: -1.0, 21: -1.0}, "risk_pct": 0.10},
{"time_r": {5: 0.5, 21: 0.5}, "risk_pct": 0.10},
]
b = bt._time_exit_bucket(cands, 5)
assert b["hold_days"] == 5
assert b["total"] == 3
assert b["wins"] == 2
assert b["win_rate"] == pytest.approx(66.7, abs=0.1)
assert b["avg_r"] == pytest.approx(0.3, abs=0.01)
assert b["net_avg_r"] == pytest.approx(0.28, abs=0.01)
assert b["best_r"] == pytest.approx(1.4)
assert b["worst_r"] == pytest.approx(-1.0)
# No stop_day on any candidate → every hold runs the full 5 days.
assert b["avg_hold_days"] == 5.0
assert b["net_r_per_day"] == pytest.approx(0.28 / 5.0, abs=0.001)
def test_missing_hold_skipped(self):
b = bt._time_exit_bucket([{"time_r": {5: 1.0}}], 21)
assert b["total"] == 0
assert b["avg_r"] is None
def _acand(
rr: float = 2.0,
conf: float = 60.0,
action: str = "LONG_MODERATE",
mp: float | None = 90.0,
direction: str = "long",
) -> dict:
"""Ablation candidate: meets_core mirrors the default floors (min_rr 1.2,
min_confidence 55, exclude_neutral on)."""
meets = rr >= 1.2 and conf >= 55.0 and action != "NEUTRAL"
return {
"rr": rr,
"confidence": conf,
"action": action,
"momentum_percentile": mp,
"direction": direction,
"meets_core": meets,
"risk_level": "Low",
"target_hit": True,
"outcome": OUTCOME_TARGET_HIT,
"realized_r": rr,
"risk_pct": 0.05,
"time_r": {d: 0.5 for d in bt.TIME_EXIT_DAYS},
}
class TestGateAblation:
ACTIVATION = {
"min_rr": 1.2,
"min_confidence": 55.0,
"exclude_neutral": True,
"require_high_conviction": False,
"exclude_conflicts": False,
}
def test_variant_counts(self):
cands = [
_acand(), # clears everything
_acand(conf=40.0), # fails confidence floor
_acand(rr=1.0), # fails R:R floor
_acand(action="NEUTRAL"), # fails NEUTRAL exclusion
_acand(mp=50.0), # fails the momentum cutoff
_acand(direction="short", mp=95.0), # short — gated out
]
rows = {r["variant"]: r for r in bt._gate_ablation(cands, self.ACTIVATION, 80.0)}
assert rows["all_floors"]["total"] == 1
assert rows["no_confidence_floor"]["total"] == 2
assert rows["no_rr_floor"]["total"] == 2
assert rows["no_neutral_exclusion"]["total"] == 2
assert rows["momentum_only"]["total"] == 4
assert rows["all_floors"]["net_avg_r"] is not None
# Every variant is also graded under the hold-to-horizon exit.
assert rows["all_floors"]["hold_days"] == max(bt.TIME_EXIT_DAYS)
assert rows["all_floors"]["hold_avg_r"] == pytest.approx(0.5)
assert rows["all_floors"]["hold_net_avg_r"] is not None
assert rows["momentum_only"]["hold_total_r"] == pytest.approx(4 * 0.5, abs=0.01)
def test_threshold_zero_disables_momentum_gate(self):
# Floors only: the short and the low-momentum long both pass all_floors.
cands = [_acand(mp=50.0), _acand(direction="short", mp=None)]
rows = {r["variant"]: r for r in bt._gate_ablation(cands, self.ACTIVATION, 0.0)}
assert rows["all_floors"]["total"] == 2
def _sim_prices(start_ord: int, closes: list[float]) -> tuple:
"""Column arrays for consecutive daily bars: open = close (no gaps),
high/low = close ± 1."""
ords = list(range(start_ord, start_ord + len(closes)))
return (
ords,
list(closes),
[c + 1.0 for c in closes],
[c - 1.0 for c in closes],
list(closes),
[1_000_000] * len(closes),
)
def _sim_cand(
sym: str, day_ord: int, entry: float, stop: float, target: float, mp: float = 90.0
) -> dict:
return {
"qualified": True,
"direction": "long",
"symbol": sym,
"date": date.fromordinal(day_ord).isoformat(),
"entry": entry,
"stop": stop,
"target": target,
"momentum_percentile": mp,
}
class TestSimulatePortfolio:
ORD = date(2025, 1, 6).toordinal()
def test_hold_policy_accounting(self):
closes = [100.0, 102.0, 104.0, 106.0, 108.0, 110.0]
prices = {"AAA": _sim_prices(self.ORD, closes)}
cand = _sim_cand("AAA", self.ORD, entry=100.0, stop=95.0, target=130.0)
sim = bt._simulate_portfolio([cand], prices, None, "hold", 3)
assert sim is not None
assert sim["trades"] == 1
# 20 shares (1% risk / $5 stop distance), exit at the day-3 close 106:
# pnl = 2120 2000 2.00 entry cost 2.12 exit cost = 115.88
assert sim["final_equity"] == pytest.approx(10_115.88, abs=0.01)
assert sim["win_rate"] == 100.0
assert sim["best_trade_r"] == pytest.approx(1.2)
assert sim["avg_hold_days"] == 3.0
assert sim["max_drawdown_pct"] == 0.0
assert sim["cagr_pct"] is None # window far too short to annualize
assert sim["spy_return_pct"] is None
assert sim["yearly_returns"] == [
{"year": 2025, "return_pct": pytest.approx(1.2, abs=0.05)}
]
def test_target_policy_exits_at_target(self):
closes = [100.0, 102.0, 104.0, 106.0, 108.0, 110.0]
prices = {"AAA": _sim_prices(self.ORD, closes)}
cand = _sim_cand("AAA", self.ORD, entry=100.0, stop=95.0, target=105.0)
sim = bt._simulate_portfolio([cand], prices, None, "target", 30)
assert sim is not None
assert sim["trades"] == 1
assert sim["best_trade_r"] == pytest.approx(1.0) # filled exactly at 105
def test_stop_gap_fills_at_open(self):
# Day-1 bar gaps to a 90 open, below the 95 stop → fill at the open.
ords = list(range(self.ORD, self.ORD + 2))
prices = {"AAA": (ords, [100.0, 90.0], [101.0, 92.0], [99.0, 88.0], [100.0, 91.0], [1, 1])}
cand = _sim_cand("AAA", self.ORD, entry=100.0, stop=95.0, target=120.0)
sim = bt._simulate_portfolio([cand], prices, None, "hold", 30)
assert sim is not None
assert sim["trades"] == 1
assert sim["worst_trade_r"] == pytest.approx(-2.0) # (90 100) / 5
def test_nothing_qualified_returns_none(self):
assert bt._simulate_portfolio([], {}, None, "hold", 30) is None
def test_bucket_stats_counts_and_expectancy():
cands = [
_cand(70, OUTCOME_TARGET_HIT, 3.0), # +3R win
_cand(60, OUTCOME_TARGET_HIT, 2.0), # +2R win
_cand(40, OUTCOME_STOP_HIT, 3.0), # -1R loss
_cand(30, OUTCOME_EXPIRED, 3.0), # 0R expired
]
s = bt._bucket_stats(cands)
assert s["total"] == 4
assert s["wins"] == 2
assert s["losses"] == 1
assert s["expired"] == 1
# hit rate is over decided (wins+losses) only
assert s["hit_rate"] == round(2 / 3 * 100, 1)
# avg R = (3 + 2 - 1 + 0) / 4 = 1.0
assert s["avg_r"] == 1.0
assert s["total_r"] == 4.0
# net = gross minus a 0.04R round trip per candidate (risk_pct 0.05)
assert s["net_avg_r"] == pytest.approx(1.0 - _COST_R_005, abs=0.001)
assert s["net_total_r"] == pytest.approx(4.0 - 4 * _COST_R_005, abs=0.01)
assert s["best_r"] == 3.0
assert s["worst_r"] == -1.0
assert s["avg_hold_days"] == 10.0
assert s["net_r_per_day"] == pytest.approx((1.0 - _COST_R_005) / 10.0, abs=0.001)
# robustness: net rs are [2.96, 1.96, -1.04, -0.04]
assert s["median_net_r"] == pytest.approx(0.96, abs=0.001)
assert s["profit_factor"] == pytest.approx(4.92 / 1.08, abs=0.01)
# ex-top-5%: ceil(4 * 0.05) = 1 winner trimmed → mean of the remaining three
assert s["net_avg_r_ex_top5"] == pytest.approx((1.96 - 1.04 - 0.04) / 3, abs=0.001)
def test_bucket_stats_empty():
s = bt._bucket_stats([])
assert s["total"] == 0
assert s["hit_rate"] is None
assert s["avg_r"] is None
assert s["net_avg_r"] is None
def test_bucket_stats_no_risk_pct_means_no_cost():
c = _cand(50, OUTCOME_TARGET_HIT, 2.0)
del c["risk_pct"]
s = bt._bucket_stats([c])
assert s["net_avg_r"] == s["avg_r"]
assert s["net_total_r"] == s["total_r"]
def test_build_recommendation_reads_the_report():
report = {
"overall_qualified": {"net_avg_r": 0.13, "net_avg_r_ex_top5": 0.05},
"time_exit_sweep": [
{"hold_days": 21, "net_avg_r": 0.38},
{"hold_days": 30, "net_avg_r": 0.50},
],
"gate_ablation": [
{"variant": "all_floors", "total": 100, "hold_net_avg_r": 0.50},
{"variant": "no_confidence_floor", "total": 130, "hold_net_avg_r": 0.49},
{"variant": "no_rr_floor", "total": 400, "hold_net_avg_r": 0.34},
{"variant": "no_neutral_exclusion", "total": 120, "hold_net_avg_r": 0.46},
],
"sweep": [
{"min_momentum_percentile": 80.0, "net_avg_r": 0.13, "total": 100},
{"min_momentum_percentile": 60.0, "net_avg_r": 0.05, "total": 300},
{"min_momentum_percentile": 0.0, "net_avg_r": -0.12, "total": 1000},
],
"portfolio_sim": {"policies": [
{"policy": "target", "cagr_pct": 23.7, "total_return_pct": 134.8,
"spy_return_pct": 95.9, "max_drawdown_pct": 20.7},
{"policy": "hold", "cagr_pct": 31.9, "total_return_pct": 203.6,
"spy_return_pct": 95.9, "max_drawdown_pct": 21.2},
]},
}
rec = bt._build_recommendation(report)
by_topic: dict[str, list[str]] = {}
for item in rec["items"]:
by_topic.setdefault(item["topic"], []).append(item["text"])
assert rec["headline"] is not None and "hold 30" in rec["headline"]
assert any("hold 30 trading days" in t for t in by_topic["exit"])
gate_texts = " | ".join(by_topic["gate"])
assert "confidence floor adds nothing" in gate_texts
assert "keep the R:R floor" in gate_texts
assert "keep the NEUTRAL exclusion" in gate_texts
assert "80" in by_topic["cutoff"][0]
assert "beats" in by_topic["benchmark"][0]
assert any("not a handful of outliers" in t for t in by_topic["robustness"])
def test_build_recommendation_flags_outlier_dependence():
rec = bt._build_recommendation({
"overall_qualified": {"net_avg_r": 0.13, "net_avg_r_ex_top5": -0.02},
})
robustness = [i["text"] for i in rec["items"] if i["topic"] == "robustness"]
assert robustness and "WARNING" in robustness[0]
def test_window_setups_too_short_returns_empty():
assert bt._window_setups([], {}, {}) == []
def test_replay_ticker_candidates_carry_gate_fields():
"""The ablation recomputes floors from candidate fields — a candidate missing
action/risk_level silently zeroes the ablation rows (July 2026 regression)."""
from app.services.admin_service import ACTIVATION_DEFAULTS
from app.services.recommendation_service import DEFAULT_RECOMMENDATION_CONFIG
base = date(2025, 1, 1)
bars = []
for i in range(160):
close = 100.0 + 8.0 * math.sin(i / 6.0)
bars.append(SimpleNamespace(
date=base + timedelta(days=i),
open=close,
high=close + 1.5,
low=close - 1.5,
close=close,
volume=1_000_000 + (i % 5) * 1000,
))
cands = bt._replay_ticker(
"OSC", bars, dict(DEFAULT_RECOMMENDATION_CONFIG), dict(ACTIVATION_DEFAULTS)
)
assert cands, "expected the oscillating series to produce candidates"
for c in cands:
assert c.get("action") is not None
assert "risk_level" in c
async def _seed_oscillating_ticker(session, symbol: str, n: int = 160) -> None:
t = Ticker(symbol=symbol)
session.add(t)
await session.flush()
base = date(2025, 1, 1)
for i in range(n):
close = 100.0 + 8.0 * math.sin(i / 6.0)
session.add(OHLCVRecord(
ticker_id=t.id,
date=base + timedelta(days=i),
open=close,
high=close + 1.5,
low=close - 1.5,
close=close,
volume=1_000_000 + (i % 5) * 1000,
))
await session.commit()
async def test_run_backtest_smoke(session):
await _seed_oscillating_ticker(session, "OSC")
report = await bt.run_backtest(session)
# well-formed report
assert report["tickers"] == 1
assert isinstance(report["candidates"], int)
for key in (
"overall_qualified", "overall_all", "by_direction", "sweep",
"gate_ablation", "time_exit_sweep", "portfolio_sim", "recommendation",
):
assert key in report
# the oscillating series should yield at least some resolved setups
assert report["candidates"] >= 1
# cost assumption is reported, and every bucket carries net numbers
assert report["params"]["cost_per_side_pct"] == pytest.approx(bt.COST_PER_SIDE * 100)
assert "net_avg_r" in report["overall_all"]
# ablation baseline reproduces the qualified set exactly, and every row
# carries the hold-to-horizon grading alongside the target model
ablation = {r["variant"]: r for r in report["gate_ablation"]}
assert ablation["all_floors"]["total"] == report["overall_qualified"]["total"]
for row in report["gate_ablation"]:
assert "hold_net_avg_r" in row
# time-exit sweep covers the configured hold lengths
assert [r["hold_days"] for r in report["time_exit_sweep"]] == list(bt.TIME_EXIT_DAYS)
# portfolio simulation section is always present (policies may be empty
# when nothing qualifies)
assert "portfolio_sim" in report
assert isinstance(report["portfolio_sim"]["policies"], list)
assert report["portfolio_sim"]["params"]["max_positions"] == bt.SIM_MAX_POSITIONS
# sweep: lowering the momentum-percentile cutoff can only add qualifiers
sweep = sorted(report["sweep"], key=lambda r: r["min_momentum_percentile"], reverse=True)
counts = [r["total"] for r in sweep]
assert counts == sorted(counts) # ascending as threshold descends