"""Tests for the historical backtest harness.""" from __future__ import annotations import math from datetime import date, timedelta from types import SimpleNamespace import pytest from app.models.ohlcv import OHLCVRecord from app.models.ticker import Ticker from app.services import backtest_service as bt from app.services.outcome_service import ( OUTCOME_EXPIRED, OUTCOME_STOP_HIT, OUTCOME_TARGET_HIT, ) from tests.conftest import _test_session_factory # type: ignore @pytest.fixture async def session(): async with _test_session_factory() as s: yield s def _cand( prob: float, outcome: str, rr: float, qualified: bool = True, direction: str = "long", risk_pct: float = 0.05, hold_days: int = 10, ) -> dict: target_hit = outcome == OUTCOME_TARGET_HIT realized = rr if target_hit else (0.0 if outcome == OUTCOME_EXPIRED else -1.0) return { "primary_prob": prob, "outcome": outcome, "target_hit": target_hit, "rr": rr, "realized_r": realized, "qualified": qualified, "direction": direction, "risk_pct": risk_pct, "hold_days": hold_days, } # Round-trip cost in R for the default _cand risk_pct: 2 * 0.001 / 0.05 = 0.04R. _COST_R_005 = 2 * bt.COST_PER_SIDE / 0.05 def _bar(high: float, low: float, close: float, open_: float | None = None) -> SimpleNamespace: """Synthetic daily bar. ``open`` defaults to the high so a stop is pierced intraday (fill at the stop level); pass an explicit open beyond the stop to model a gap through it.""" return SimpleNamespace( high=high, low=low, close=close, open=open_ if open_ is not None else high ) class TestStopFillR: def test_intraday_fill_at_stop(self): assert bt._stop_fill_r("long", 100.0, 95.0, _bar(101, 94, 96)) == pytest.approx(-1.0) def test_gap_fill_at_open(self): # Opens at 92, below the 95 stop → filled at the open, worse than −1R. assert bt._stop_fill_r("long", 100.0, 95.0, _bar(93, 90, 91, open_=92)) == pytest.approx(-1.6) def test_short_gap_fill_at_open(self): # Short stop 105; opens at 107 above it → fill 107. assert bt._stop_fill_r("short", 100.0, 105.0, _bar(110, 104, 108, open_=107)) == pytest.approx(-1.4) class TestRiskAndStopDay: def test_no_stop(self): risk, stop_day = bt._risk_and_stop_day("long", 100.0, 95.0, [_bar(109, 101, 108)], 30) assert risk == pytest.approx(0.05) assert stop_day is None def test_stop_day_is_one_based(self): bars = [_bar(102, 99, 101), _bar(101, 94, 96)] risk, stop_day = bt._risk_and_stop_day("long", 100.0, 95.0, bars, 30) assert risk == pytest.approx(0.05) assert stop_day == 2 def test_short_direction(self): _, stop_day = bt._risk_and_stop_day("short", 100.0, 105.0, [_bar(106, 101, 104)], 30) assert stop_day == 1 class TestTimeExits: def test_long_exits_at_horizon_close(self): bars = [_bar(103, 99, 102), _bar(105, 101, 104), _bar(107, 103, 106)] res = bt._time_exits("long", 100.0, 95.0, bars, (2, 5)) assert res[2] == pytest.approx(0.8) # close 104 → +4% / 5% risk assert res[5] == pytest.approx(1.2) # only 3 bars → last close 106 def test_stop_on_first_bar_loses_everywhere(self): res = bt._time_exits("long", 100.0, 95.0, [_bar(101, 94, 96), _bar(105, 101, 104)], (1, 5)) assert res[1] == pytest.approx(-1.0) assert res[5] == pytest.approx(-1.0) def test_stop_after_short_horizon_only_hits_long_hold(self): # Day-2 close banked by the 2-day hold; the stop on day 3 only hits n=5. bars = [_bar(103, 99, 102), _bar(104, 100, 103), _bar(101, 94, 95)] res = bt._time_exits("long", 100.0, 95.0, bars, (2, 5)) assert res[2] == pytest.approx(0.6) # close 103 → +3% / 5% risk assert res[5] == pytest.approx(-1.0) def test_short_direction(self): res = bt._time_exits("short", 100.0, 105.0, [_bar(101, 95, 96)], (1,)) assert res[1] == pytest.approx(0.8) # close 96 → +4% / 5% risk def test_zero_risk_returns_zero(self): res = bt._time_exits("long", 100.0, 100.0, [_bar(103, 99, 102)], (5,)) assert res[5] == 0.0 def test_gap_through_stop_fills_at_open(self): res = bt._time_exits("long", 100.0, 95.0, [_bar(93, 90, 91, open_=92)], (5,)) assert res[5] == pytest.approx(-1.6) class TestTimeExitBucket: def test_bucket(self): cands = [ {"time_r": {5: 1.4, 21: 0.8}, "risk_pct": 0.10}, {"time_r": {5: -1.0, 21: -1.0}, "risk_pct": 0.10}, {"time_r": {5: 0.5, 21: 0.5}, "risk_pct": 0.10}, ] b = bt._time_exit_bucket(cands, 5) assert b["hold_days"] == 5 assert b["total"] == 3 assert b["wins"] == 2 assert b["win_rate"] == pytest.approx(66.7, abs=0.1) assert b["avg_r"] == pytest.approx(0.3, abs=0.01) assert b["net_avg_r"] == pytest.approx(0.28, abs=0.01) assert b["best_r"] == pytest.approx(1.4) assert b["worst_r"] == pytest.approx(-1.0) # No stop_day on any candidate → every hold runs the full 5 days. assert b["avg_hold_days"] == 5.0 assert b["net_r_per_day"] == pytest.approx(0.28 / 5.0, abs=0.001) # robustness on net rs [1.38, -1.02, 0.48] assert b["median_net_r"] == pytest.approx(0.48, abs=0.001) assert b["profit_factor"] == pytest.approx(1.86 / 1.02, abs=0.01) assert b["net_avg_r_ex_top5"] == pytest.approx((0.48 - 1.02) / 2, abs=0.001) def test_missing_hold_skipped(self): b = bt._time_exit_bucket([{"time_r": {5: 1.0}}], 21) assert b["total"] == 0 assert b["avg_r"] is None def _acand( rr: float = 2.0, conf: float = 60.0, action: str = "LONG_MODERATE", mp: float | None = 90.0, direction: str = "long", ) -> dict: """Ablation candidate: meets_core mirrors the default floors (min_rr 1.2, min_confidence 55, exclude_neutral on).""" meets = rr >= 1.2 and conf >= 55.0 and action != "NEUTRAL" return { "rr": rr, "confidence": conf, "action": action, "momentum_percentile": mp, "direction": direction, "meets_core": meets, "risk_level": "Low", "target_hit": True, "outcome": OUTCOME_TARGET_HIT, "realized_r": rr, "risk_pct": 0.05, "time_r": {d: 0.5 for d in bt.TIME_EXIT_DAYS}, } class TestGateAblation: ACTIVATION = { "min_rr": 1.2, "min_confidence": 55.0, "exclude_neutral": True, "require_high_conviction": False, "exclude_conflicts": False, } def test_variant_counts(self): cands = [ _acand(), # clears everything _acand(conf=40.0), # fails confidence floor _acand(rr=1.0), # fails R:R floor _acand(action="NEUTRAL"), # fails NEUTRAL exclusion _acand(mp=50.0), # fails the momentum cutoff _acand(direction="short", mp=95.0), # short — gated out ] rows = {r["variant"]: r for r in bt._gate_ablation(cands, self.ACTIVATION, 80.0)} assert rows["all_floors"]["total"] == 1 assert rows["no_confidence_floor"]["total"] == 2 assert rows["no_rr_floor"]["total"] == 2 assert rows["no_neutral_exclusion"]["total"] == 2 assert rows["momentum_only"]["total"] == 4 assert rows["all_floors"]["net_avg_r"] is not None # Every variant is also graded under the hold-to-horizon exit. assert rows["all_floors"]["hold_days"] == max(bt.TIME_EXIT_DAYS) assert rows["all_floors"]["hold_avg_r"] == pytest.approx(0.5) assert rows["all_floors"]["hold_net_avg_r"] is not None assert rows["momentum_only"]["hold_total_r"] == pytest.approx(4 * 0.5, abs=0.01) def test_threshold_zero_disables_momentum_gate(self): # Floors only: the short and the low-momentum long both pass all_floors. cands = [_acand(mp=50.0), _acand(direction="short", mp=None)] rows = {r["variant"]: r for r in bt._gate_ablation(cands, self.ACTIVATION, 0.0)} assert rows["all_floors"]["total"] == 2 def _sim_prices(start_ord: int, closes: list[float]) -> tuple: """Column arrays for consecutive daily bars: open = close (no gaps), high/low = close ± 1.""" ords = list(range(start_ord, start_ord + len(closes))) return ( ords, list(closes), [c + 1.0 for c in closes], [c - 1.0 for c in closes], list(closes), [1_000_000] * len(closes), ) def _sim_cand( sym: str, day_ord: int, entry: float, stop: float, target: float, mp: float = 90.0 ) -> dict: return { "qualified": True, "direction": "long", "symbol": sym, "date": date.fromordinal(day_ord).isoformat(), "entry": entry, "stop": stop, "target": target, "momentum_percentile": mp, } class TestSimulatePortfolio: ORD = date(2025, 1, 6).toordinal() def test_hold_policy_accounting(self): closes = [100.0, 102.0, 104.0, 106.0, 108.0, 110.0] prices = {"AAA": _sim_prices(self.ORD, closes)} cand = _sim_cand("AAA", self.ORD, entry=100.0, stop=95.0, target=130.0) sim = bt._simulate_portfolio([cand], prices, None, "hold", 3) assert sim is not None assert sim["trades"] == 1 # 20 shares (1% risk / $5 stop distance), exit at the day-3 close 106: # pnl = 2120 − 2000 − 2.00 entry cost − 2.12 exit cost = 115.88 assert sim["final_equity"] == pytest.approx(10_115.88, abs=0.01) assert sim["win_rate"] == 100.0 assert sim["best_trade_r"] == pytest.approx(1.2) assert sim["avg_hold_days"] == 3.0 assert sim["max_drawdown_pct"] == 0.0 assert sim["cagr_pct"] is None # window far too short to annualize assert sim["spy_return_pct"] is None assert sim["yearly_returns"] == [ {"year": 2025, "return_pct": pytest.approx(1.2, abs=0.05)} ] def test_target_policy_exits_at_target(self): closes = [100.0, 102.0, 104.0, 106.0, 108.0, 110.0] prices = {"AAA": _sim_prices(self.ORD, closes)} cand = _sim_cand("AAA", self.ORD, entry=100.0, stop=95.0, target=105.0) sim = bt._simulate_portfolio([cand], prices, None, "target", 30) assert sim is not None assert sim["trades"] == 1 assert sim["best_trade_r"] == pytest.approx(1.0) # filled exactly at 105 def test_stop_gap_fills_at_open(self): # Day-1 bar gaps to a 90 open, below the 95 stop → fill at the open. ords = list(range(self.ORD, self.ORD + 2)) prices = {"AAA": (ords, [100.0, 90.0], [101.0, 92.0], [99.0, 88.0], [100.0, 91.0], [1, 1])} cand = _sim_cand("AAA", self.ORD, entry=100.0, stop=95.0, target=120.0) sim = bt._simulate_portfolio([cand], prices, None, "hold", 30) assert sim is not None assert sim["trades"] == 1 assert sim["worst_trade_r"] == pytest.approx(-2.0) # (90 − 100) / 5 def test_nothing_qualified_returns_none(self): assert bt._simulate_portfolio([], {}, None, "hold", 30) is None def test_bucket_stats_counts_and_expectancy(): cands = [ _cand(70, OUTCOME_TARGET_HIT, 3.0), # +3R win _cand(60, OUTCOME_TARGET_HIT, 2.0), # +2R win _cand(40, OUTCOME_STOP_HIT, 3.0), # -1R loss _cand(30, OUTCOME_EXPIRED, 3.0), # 0R expired ] s = bt._bucket_stats(cands) assert s["total"] == 4 assert s["wins"] == 2 assert s["losses"] == 1 assert s["expired"] == 1 # hit rate is over decided (wins+losses) only assert s["hit_rate"] == round(2 / 3 * 100, 1) # avg R = (3 + 2 - 1 + 0) / 4 = 1.0 assert s["avg_r"] == 1.0 assert s["total_r"] == 4.0 # net = gross minus a 0.04R round trip per candidate (risk_pct 0.05) assert s["net_avg_r"] == pytest.approx(1.0 - _COST_R_005, abs=0.001) assert s["net_total_r"] == pytest.approx(4.0 - 4 * _COST_R_005, abs=0.01) assert s["best_r"] == 3.0 assert s["worst_r"] == -1.0 assert s["avg_hold_days"] == 10.0 assert s["net_r_per_day"] == pytest.approx((1.0 - _COST_R_005) / 10.0, abs=0.001) # robustness: net rs are [2.96, 1.96, -1.04, -0.04] assert s["median_net_r"] == pytest.approx(0.96, abs=0.001) assert s["profit_factor"] == pytest.approx(4.92 / 1.08, abs=0.01) # ex-top-5%: ceil(4 * 0.05) = 1 winner trimmed → mean of the remaining three assert s["net_avg_r_ex_top5"] == pytest.approx((1.96 - 1.04 - 0.04) / 3, abs=0.001) def test_bucket_stats_empty(): s = bt._bucket_stats([]) assert s["total"] == 0 assert s["hit_rate"] is None assert s["avg_r"] is None assert s["net_avg_r"] is None def test_bucket_stats_no_risk_pct_means_no_cost(): c = _cand(50, OUTCOME_TARGET_HIT, 2.0) del c["risk_pct"] s = bt._bucket_stats([c]) assert s["net_avg_r"] == s["avg_r"] assert s["net_total_r"] == s["total_r"] def test_build_recommendation_reads_the_report(): report = { "overall_qualified": {"net_avg_r": 0.13, "net_avg_r_ex_top5": 0.05}, "time_exit_sweep": [ {"hold_days": 21, "net_avg_r": 0.38}, {"hold_days": 30, "net_avg_r": 0.50, "net_avg_r_ex_top5": 0.21}, ], "gate_ablation": [ {"variant": "all_floors", "total": 100, "hold_net_avg_r": 0.50}, {"variant": "no_confidence_floor", "total": 130, "hold_net_avg_r": 0.49}, {"variant": "no_rr_floor", "total": 400, "hold_net_avg_r": 0.34}, {"variant": "no_neutral_exclusion", "total": 120, "hold_net_avg_r": 0.46}, ], "sweep": [ {"min_momentum_percentile": 80.0, "net_avg_r": 0.13, "total": 100}, {"min_momentum_percentile": 60.0, "net_avg_r": 0.05, "total": 300}, {"min_momentum_percentile": 0.0, "net_avg_r": -0.12, "total": 1000}, ], "portfolio_sim": {"policies": [ {"policy": "target", "cagr_pct": 23.7, "total_return_pct": 134.8, "spy_return_pct": 95.9, "max_drawdown_pct": 20.7}, {"policy": "hold", "cagr_pct": 31.9, "total_return_pct": 203.6, "spy_return_pct": 95.9, "max_drawdown_pct": 21.2}, ]}, } rec = bt._build_recommendation(report) by_topic: dict[str, list[str]] = {} for item in rec["items"]: by_topic.setdefault(item["topic"], []).append(item["text"]) assert rec["headline"] is not None and "hold 30" in rec["headline"] assert any("hold 30 trading days" in t for t in by_topic["exit"]) gate_texts = " | ".join(by_topic["gate"]) assert "confidence floor adds nothing" in gate_texts assert "keep the R:R floor" in gate_texts assert "keep the NEUTRAL exclusion" in gate_texts assert "80" in by_topic["cutoff"][0] assert "beats" in by_topic["benchmark"][0] # robustness is judged under the RECOMMENDED exit (the 30d hold), not the # target model the recommendation advises abandoning assert any( "not a handful of outliers" in t and "under the recommended 30d hold" in t for t in by_topic["robustness"] ) def test_build_recommendation_flags_outlier_dependence(): rec = bt._build_recommendation({ "overall_qualified": {"net_avg_r": 0.13, "net_avg_r_ex_top5": -0.02}, }) robustness = [i["text"] for i in rec["items"] if i["topic"] == "robustness"] assert robustness and "WARNING" in robustness[0] def test_window_setups_too_short_returns_empty(): assert bt._window_setups([], {}, {}) == [] def test_replay_ticker_candidates_carry_gate_fields(): """The ablation recomputes floors from candidate fields — a candidate missing action/risk_level silently zeroes the ablation rows (July 2026 regression).""" from app.services.admin_service import ACTIVATION_DEFAULTS from app.services.recommendation_service import DEFAULT_RECOMMENDATION_CONFIG base = date(2025, 1, 1) bars = [] for i in range(160): close = 100.0 + 8.0 * math.sin(i / 6.0) bars.append(SimpleNamespace( date=base + timedelta(days=i), open=close, high=close + 1.5, low=close - 1.5, close=close, volume=1_000_000 + (i % 5) * 1000, )) cands = bt._replay_ticker( "OSC", bars, dict(DEFAULT_RECOMMENDATION_CONFIG), dict(ACTIVATION_DEFAULTS) ) assert cands, "expected the oscillating series to produce candidates" for c in cands: assert c.get("action") is not None assert "risk_level" in c async def _seed_oscillating_ticker(session, symbol: str, n: int = 160) -> None: t = Ticker(symbol=symbol) session.add(t) await session.flush() base = date(2025, 1, 1) for i in range(n): close = 100.0 + 8.0 * math.sin(i / 6.0) session.add(OHLCVRecord( ticker_id=t.id, date=base + timedelta(days=i), open=close, high=close + 1.5, low=close - 1.5, close=close, volume=1_000_000 + (i % 5) * 1000, )) await session.commit() async def test_run_backtest_smoke(session): await _seed_oscillating_ticker(session, "OSC") report = await bt.run_backtest(session) # well-formed report assert report["tickers"] == 1 assert isinstance(report["candidates"], int) for key in ( "overall_qualified", "overall_all", "by_direction", "sweep", "gate_ablation", "time_exit_sweep", "portfolio_sim", "recommendation", ): assert key in report # the oscillating series should yield at least some resolved setups assert report["candidates"] >= 1 # cost assumption is reported, and every bucket carries net numbers assert report["params"]["cost_per_side_pct"] == pytest.approx(bt.COST_PER_SIDE * 100) assert "net_avg_r" in report["overall_all"] # ablation baseline reproduces the qualified set exactly, and every row # carries the hold-to-horizon grading alongside the target model ablation = {r["variant"]: r for r in report["gate_ablation"]} assert ablation["all_floors"]["total"] == report["overall_qualified"]["total"] for row in report["gate_ablation"]: assert "hold_net_avg_r" in row # time-exit sweep covers the configured hold lengths assert [r["hold_days"] for r in report["time_exit_sweep"]] == list(bt.TIME_EXIT_DAYS) # portfolio simulation section is always present (policies may be empty # when nothing qualifies) assert "portfolio_sim" in report assert isinstance(report["portfolio_sim"]["policies"], list) assert report["portfolio_sim"]["params"]["max_positions"] == bt.SIM_MAX_POSITIONS # sweep: lowering the momentum-percentile cutoff can only add qualifiers sweep = sorted(report["sweep"], key=lambda r: r["min_momentum_percentile"], reverse=True) counts = [r["total"] for r in sweep] assert counts == sorted(counts) # ascending as threshold descends