diff --git a/app/services/backtest_service.py b/app/services/backtest_service.py index b17e410..13a8340 100644 --- a/app/services/backtest_service.py +++ b/app/services/backtest_service.py @@ -3,13 +3,21 @@ OHLCV and measure how the CURRENT config would have performed. For each ticker we step through history (weekly), and at each as-of date D we rebuild the setup using only bars ≤ D (no lookahead), then walk the actual bars -after D to record the realized outcome. Two reports come out: +after D to record the realized outcome. The report contains: - - realized hit-rate / expectancy of qualified setups (and of all setups) - - a probability calibration curve: do "60% likely" targets hit ~60% of the time? + - hit-rate / expectancy of qualified setups vs the all-setups control group, + gross and net of costs, with robustness stats (median, profit factor, + expectancy without the top winners) + - the momentum-percentile sweep and the gate ablation (each floor removed in + turn, graded under both the target and the hold-to-horizon exit) + - the time-exit sweep (hold N days with the initial stop) + - cross-sectional factor rank-IC ("signal edge") + - a capital-constrained portfolio simulation (equity curve → CAGR, drawdown, + Sharpe, SPY comparison) + - a data-driven recommendation derived from this report's numbers Limitation: sentiment and fundamentals have no point-in-time history, so they're -held neutral here — this calibrates the price/S-R/probability machinery only. +held neutral here — this calibrates the price/S-R machinery only. """ from __future__ import annotations @@ -20,6 +28,7 @@ import logging import math import multiprocessing import os +import statistics from collections import defaultdict from collections.abc import Callable from concurrent.futures import ProcessPoolExecutor @@ -75,8 +84,6 @@ MIN_LOOKBACK = 60 # bars needed before D for indicators (EMA cross needs 51 HORIZON = 30 # trading days to resolve an outcome (matches the evaluator) ATR_MULTIPLIER = 1.5 -_CAL_BUCKETS = [(0, 20), (20, 40), (40, 60), (60, 80), (80, 100.01)] - # Cross-sectional signal evaluation (factor IC). Each candidate signal is a # point-in-time number computed from closes alone (sentiment/fundamentals have no # history here), sampled one as-of per ISO week, and graded by how its rank @@ -231,102 +238,19 @@ def _stop_fill_r(direction: str, entry: float, stop: float, bar) -> float: return (entry - fill) / risk -def _tp_primitives( +def _risk_and_stop_day( direction: str, entry: float, stop: float, forward: list, horizon: int -) -> tuple[float, bool, float, float, int | None, float]: - """Primitives for the take-profit exit model, from the bars after detection. - - Returns ``(risk_pct, stopped, mfe_pct, close_pct, stop_day, stop_r)``: - - ``risk_pct`` fraction from entry to stop (the 1R distance) - - ``stopped`` whether the stop was hit within the horizon - - ``mfe_pct`` best favourable excursion (fraction) reachable *before* the - stop — strictly before the stop bar, so a same-bar tp+stop - counts as a loss (matching the conservative target model); - over the whole horizon if the stop is never hit - - ``close_pct`` directional return at the horizon-end close (the timeout exit) - - ``stop_day`` 1-based trading day the stop was pierced, None if never - - ``stop_r`` realized R at the stop fill (≤ −1 when the bar gapped - through the stop — see _stop_fill_r); −1.0 when unused - - From these any fixed take-profit level can be scored without re-walking bars: - tp reached before stop (``mfe_pct >= tp``) → +tp; else stop → ``stop_r``; - else the horizon-close move. - """ +) -> tuple[float, int | None]: + """``(risk_pct, stop_day)`` from the bars after detection: the 1R stop + distance as a fraction of entry, and the 1-based trading day the initial + stop was first pierced within the horizon (None if never). Feeds the cost + conversion and the time-exit hold accounting.""" long = direction == "long" risk_pct = abs(entry - stop) / entry if entry else 0.0 - bars = forward[:horizon] - if not bars: - return risk_pct, False, 0.0, 0.0, None, -1.0 - mfe = 0.0 - stopped = False - stop_day: int | None = None - stop_r = -1.0 - for i, r in enumerate(bars): + for i, r in enumerate(forward[:horizon]): if (r.low <= stop) if long else (r.high >= stop): - stopped = True - stop_day = i + 1 - stop_r = _stop_fill_r(direction, entry, stop, r) - break - fav = (r.high - entry) / entry if long else (entry - r.low) / entry - if fav > mfe: - mfe = fav - close_pct = ((bars[-1].close - entry) / entry) * (1.0 if long else -1.0) - return risk_pct, stopped, mfe, close_pct, stop_day, stop_r - - -def _trailing_exits( - direction: str, entry: float, init_stop: float, trail_fracs, forward: list, horizon: int -) -> dict[int, float]: - """Realized R per trailing-stop width, in one pass over the post-entry bars. - - The stop ratchets up (never below the initial stop): ``max(init_stop, - peak*(1-trail))`` for a long. Exit when a bar pierces the current stop (filled - at the stop level), else at the horizon-end close. Each width is keyed by its - integer percent (5 for 0.05). Conservative: the stop for a bar uses the peak - through the *previous* bar (this bar's high is folded in only afterwards). - R is relative to the initial risk (entry → init_stop). - """ - long = direction == "long" - risk = abs(entry - init_stop) / entry if entry else 0.0 - if risk <= 0: - return {round(f * 100): 0.0 for f in trail_fracs} - bars = forward[:horizon] - if not bars: - return {round(f * 100): 0.0 for f in trail_fracs} - - result: dict[int, float] = {} - peak = entry - active = list(trail_fracs) - for r in bars: - remaining = [] - for f in active: - if long: - stop_level = max(init_stop, peak * (1 - f)) - if r.low <= stop_level: - fill = min(stop_level, r.open) # gap through fills at the open - result[round(f * 100)] = ((fill - entry) / entry) / risk - continue - else: - stop_level = min(init_stop, peak * (1 + f)) - if r.high >= stop_level: - fill = max(stop_level, r.open) - result[round(f * 100)] = ((entry - fill) / entry) / risk - continue - remaining.append(f) - active = remaining - if not active: - break - if long: - if r.high > peak: - peak = r.high - elif r.low < peak: - peak = r.low - - last_close = bars[-1].close - timeout_r = (((last_close - entry) / entry) if long else ((entry - last_close) / entry)) / risk - for f in active: - result[round(f * 100)] = timeout_r - return result + return risk_pct, i + 1 + return risk_pct, None def _time_exits( @@ -337,8 +261,8 @@ def _time_exits( The initial stop stays active (fill at the stop level → −1R); otherwise the trade exits at the day-N close (the last available close when history ends early). No target, no trailing — the classic momentum implementation: buy, - hold ~N days, re-rank. Same conservative bar logic as ``_tp_primitives``: a - bar that pierces the stop is a loss before that bar's close counts. + hold ~N days, re-rank. Conservative bar logic: a bar that pierces the stop + is a loss before that bar's close counts. """ long = direction == "long" risk = abs(entry - stop) / entry if entry else 0.0 @@ -405,14 +329,9 @@ def _replay_ticker(symbol: str, records: list, config: dict, activation: dict) - ) else: # expired realized_r = 0.0 - # Take-profit exit primitives (parallel to the target-vs-stop outcome - # above; aggregated separately into the take-profit sweep). - risk_pct, tp_stopped, mfe_pct, tp_close_pct, stop_day, tp_stop_r = _tp_primitives( + risk_pct, stop_day = _risk_and_stop_day( s["direction"], s["entry"], s["stop"], forward, HORIZON ) - trail_r = _trailing_exits( - s["direction"], s["entry"], s["stop"], TRAIL_LEVELS, forward, HORIZON - ) time_r = _time_exits( s["direction"], s["entry"], s["stop"], forward, TIME_EXIT_DAYS ) @@ -441,11 +360,6 @@ def _replay_ticker(symbol: str, records: list, config: dict, activation: dict) - "hold_days": hold_days, "stop_day": stop_day, "risk_pct": risk_pct, - "tp_stopped": tp_stopped, - "tp_stop_r": tp_stop_r, - "mfe_pct": mfe_pct, - "tp_close_pct": tp_close_pct, - "trail_r": trail_r, "time_r": time_r, }) return candidates @@ -461,6 +375,14 @@ def _bucket_stats(cands: list[dict]) -> dict: holds = [c["hold_days"] for c in cands if c.get("hold_days")] avg_hold = sum(holds) / len(holds) if holds else None net_avg = sum(net_rs) / len(net_rs) if net_rs else None + # Robustness: does the edge depend on a handful of outliers? Median and + # profit factor describe the distribution; ex-top-5% is the expectancy with + # the biggest winners removed — if it stays positive, the edge isn't a + # lottery ticket. + gains = sum(r for r in net_rs if r > 0) + losses_abs = -sum(r for r in net_rs if r < 0) + trim_n = math.ceil(len(net_rs) * 0.05) if net_rs else 0 + trimmed = sorted(net_rs, reverse=True)[trim_n:] if net_rs else [] return { "total": len(cands), "wins": wins, @@ -478,17 +400,18 @@ def _bucket_stats(cands: list[dict]) -> dict: "net_r_per_day": ( round(net_avg / avg_hold, 4) if net_avg is not None and avg_hold else None ), + "median_net_r": round(statistics.median(net_rs), 3) if net_rs else None, + "profit_factor": round(gains / losses_abs, 2) if losses_abs > 0 else None, + "net_avg_r_ex_top5": ( + round(sum(trimmed) / len(trimmed), 3) if trimmed else None + ), } -# Fixed take-profit levels (fractions) swept for the take-profit exit model. -# Extended into the tail so the avg-R peak/plateau is visible (it's where letting -# winners run stops paying). Note: this model ignores the setup's S/R target — -# it's a standalone fixed-% exit; exiting at the target is the target model. -TP_LEVELS = (0.04, 0.06, 0.08, 0.10, 0.12, 0.15, 0.20, 0.25, 0.30, 0.40, 0.50) - -# Trailing-stop widths (give-back from the peak) swept for the trailing exit model. -TRAIL_LEVELS = (0.03, 0.05, 0.07, 0.10, 0.15, 0.20, 0.25, 0.30) +# The fixed take-profit and trailing-stop sweeps were retired 2026-07: swept +# TPs never found an interior optimum (momentum's edge lives in the right tail) +# and wide trails converged to the hold-to-horizon exit, so the time-exit sweep +# is the exit-decision surface. # Hold-N-days exits (initial stop stays active, exit at the day-N close) — the # classic cross-sectional momentum implementation: buy, hold ~a month, re-rank. @@ -507,65 +430,6 @@ def _cost_r(cand: dict) -> float: return (2.0 * COST_PER_SIDE) / risk if risk > 0 else 0.0 -def _take_profit_bucket(cands: list[dict], tp: float) -> dict: - """Stats for a fixed take-profit exit at +``tp`` (fraction): bank +tp if it's - reached before the stop, else −1R on a stop, else exit at the horizon close. - Results are in R (gain% / risk%) so they're comparable to the target model. - ``hit_rate`` here = share that reached +tp before the stop (the MFE CDF).""" - rs: list[float] = [] - net_rs: list[float] = [] - wins = 0 - for c in cands: - risk = c.get("risk_pct") or 0.0 - if risk <= 0: - continue - if c.get("mfe_pct", 0.0) >= tp: - r = tp / risk - wins += 1 - elif c.get("tp_stopped"): - r = c.get("tp_stop_r", -1.0) # gap-aware stop fill, ≤ −1R - else: - r = (c.get("tp_close_pct", 0.0)) / risk - rs.append(r) - net_rs.append(r - _cost_r(c)) - total = len(rs) - return { - "tp_pct": round(tp * 100, 1), - "total": total, - "wins": wins, - "hit_rate": round(wins / total * 100, 1) if total else None, - "avg_r": round(sum(rs) / total, 3) if total else None, - "total_r": round(sum(rs), 2) if total else None, - "net_avg_r": round(sum(net_rs) / total, 3) if total else None, - "net_total_r": round(sum(net_rs), 2) if total else None, - } - - -def _trailing_bucket(cands: list[dict], trail_pct: int) -> dict: - """Stats for a trailing-stop exit of width ``trail_pct`` (integer percent). - Each candidate carries its realized R for this width in ``trail_r``; a "win" - is simply an exit in profit (R > 0).""" - pairs = [ - (c["trail_r"][trail_pct], _cost_r(c)) - for c in cands - if c.get("trail_r", {}).get(trail_pct) is not None - ] - total = len(pairs) - rs = [r for r, _ in pairs] - net_rs = [r - cost for r, cost in pairs] - wins = sum(1 for r in rs if r > 0) - return { - "trail_pct": trail_pct, - "total": total, - "wins": wins, - "win_rate": round(wins / total * 100, 1) if total else None, - "avg_r": round(sum(rs) / total, 3) if total else None, - "total_r": round(sum(rs), 2) if total else None, - "net_avg_r": round(sum(net_rs) / total, 3) if total else None, - "net_total_r": round(sum(net_rs), 2) if total else None, - } - - def _time_exit_bucket(cands: list[dict], hold_days: int) -> dict: """Stats for the hold-``hold_days`` exit: initial stop active, otherwise out at the day-N close. Each candidate carries its realized R per hold length in @@ -605,23 +469,6 @@ def _time_exit_bucket(cands: list[dict], hold_days: int) -> dict: } -def _calibration(cands: list[dict]) -> list[dict]: - """Predicted target probability vs realized hit rate, per probability bucket.""" - rows: list[dict] = [] - for lo, hi in _CAL_BUCKETS: - bucket = [c for c in cands if lo <= c["primary_prob"] < hi] - if not bucket: - continue - hits = sum(1 for c in bucket if c["target_hit"]) - rows.append({ - "bucket": f"{int(lo)}-{int(min(hi, 100))}%", - "n": len(bucket), - "predicted_avg": round(sum(c["primary_prob"] for c in bucket) / len(bucket), 1), - "realized_hit_rate": round(hits / len(bucket) * 100, 1), - }) - return rows - - # --------------------------------------------------------------------------- # Cross-sectional signal evaluation (factor information-coefficient) # --------------------------------------------------------------------------- @@ -1172,6 +1019,31 @@ def _simulate_portfolio( if var > 0: sharpe = mean / math.sqrt(var) * math.sqrt(252) + # Per-calendar-year returns off the equity curve — shows whether every year + # contributed or one exceptional stretch carried the result. + yearly: list[dict] = [] + year_start_eq = curve[0][1] + cur_year = date.fromordinal(curve[0][0]).year + last_eq = curve[0][1] + for o, eq in curve: + y = date.fromordinal(o).year + if y != cur_year: + yearly.append({ + "year": cur_year, + "return_pct": ( + round((last_eq / year_start_eq - 1) * 100, 1) if year_start_eq > 0 else None + ), + }) + cur_year = y + year_start_eq = last_eq + last_eq = eq + yearly.append({ + "year": cur_year, + "return_pct": ( + round((last_eq / year_start_eq - 1) * 100, 1) if year_start_eq > 0 else None + ), + }) + pnls = [t["pnl"] for t in trades] wins = sum(1 for p in pnls if p > 0) spy_pct = None @@ -1201,11 +1073,163 @@ def _simulate_portfolio( ), "skipped_book_full": skipped_full, "spy_return_pct": round(spy_pct, 1) if spy_pct is not None else None, + "yearly_returns": yearly, "start_date": date.fromordinal(calendar[0]).isoformat(), "end_date": date.fromordinal(calendar[-1]).isoformat(), } +# --------------------------------------------------------------------------- +# Data-driven recommendation +# --------------------------------------------------------------------------- + +# A floor whose removal costs less than this (R net per trade, under the hold +# exit) is judged not to be pulling its weight. +_FLOOR_KEEP_THRESHOLD = 0.02 +# The hold exit must beat the target exit by at least this much to be advised. +_EXIT_SWITCH_THRESHOLD = 0.05 + + +def _build_recommendation(report: dict) -> dict: + """Strategy advice derived from THIS report's numbers — recomputed every + run, so if the data flips, the advice flips. Rules are deliberately simple + and transparent; thresholds are module constants above.""" + items: list[dict] = [] + q = report.get("overall_qualified") or {} + target_net = q.get("net_avg_r") + + # Exit policy: the production target/stop race vs the best fixed hold. + time_rows = [r for r in report.get("time_exit_sweep") or [] if r.get("net_avg_r") is not None] + best_hold = max(time_rows, key=lambda r: r["net_avg_r"], default=None) + sim_rows = { + p.get("policy"): p + for p in (report.get("portfolio_sim") or {}).get("policies", []) + } + hold_sim = sim_rows.get("hold") + if best_hold is not None and target_net is not None: + if best_hold["net_avg_r"] > target_net + _EXIT_SWITCH_THRESHOLD: + text = ( + f"Exit: hold {best_hold['hold_days']} trading days with the initial stop " + f"({best_hold['net_avg_r']:+.2f}R net/trade vs {target_net:+.2f}R for the S/R target exit)." + ) + target_sim = sim_rows.get("target") + if ( + hold_sim is not None and target_sim is not None + and hold_sim.get("cagr_pct") is not None and target_sim.get("cagr_pct") is not None + ): + text += ( + f" The simulated book agrees: {hold_sim['cagr_pct']:+.1f}% vs " + f"{target_sim['cagr_pct']:+.1f}% CAGR at similar drawdown." + ) + items.append({"topic": "exit", "text": text}) + else: + items.append({ + "topic": "exit", + "text": ( + f"Exit: keep the S/R target exit ({target_net:+.2f}R net/trade) — " + "no fixed hold beats it by a meaningful margin." + ), + }) + + # Gate floors, judged under the hold exit (the ablation's Hold column). + ablation = {r["variant"]: r for r in report.get("gate_ablation") or []} + base_row = ablation.get("all_floors") + base_hold = (base_row or {}).get("hold_net_avg_r") + floor_labels = { + "no_confidence_floor": "confidence floor", + "no_rr_floor": "R:R floor", + "no_neutral_exclusion": "NEUTRAL exclusion", + } + if base_hold is not None: + for variant, label in floor_labels.items(): + row = ablation.get(variant) + if row is None or row.get("hold_net_avg_r") is None: + continue + delta = base_hold - row["hold_net_avg_r"] + extra = row["total"] - base_row["total"] + if delta <= _FLOOR_KEEP_THRESHOLD: + items.append({ + "topic": "gate", + "text": ( + f"Gate: the {label} adds nothing — dropping it costs {delta:+.2f}R/trade " + f"and adds {extra} trades." + ), + }) + else: + items.append({ + "topic": "gate", + "text": f"Gate: keep the {label} (worth {delta:+.2f}R/trade under the hold exit).", + }) + + # Momentum cutoff: best per-trade net among the active-gate sweep rows. + sweep_rows = [ + r for r in report.get("sweep") or [] + if r.get("net_avg_r") is not None and (r.get("min_momentum_percentile") or 0) > 0 + ] + if sweep_rows: + best_cut = max(sweep_rows, key=lambda r: r["net_avg_r"]) + items.append({ + "topic": "cutoff", + "text": ( + f"Momentum cutoff: {best_cut['min_momentum_percentile']:.0f} has the best " + f"per-trade net ({best_cut['net_avg_r']:+.2f}R over {best_cut['total']} setups)." + ), + }) + + # Book vs benchmark. + book = hold_sim or sim_rows.get("target") + if book is not None and book.get("spy_return_pct") is not None: + edge = book["total_return_pct"] - book["spy_return_pct"] + verdict = "beats" if edge > 0 else "LAGS" + items.append({ + "topic": "benchmark", + "text": ( + f"Book vs SPY: {verdict} buy-and-hold by {edge:+.1f} points " + f"({book['total_return_pct']:+.1f}% vs {book['spy_return_pct']:+.1f}%), " + f"max drawdown −{book['max_drawdown_pct']:.1f}%." + ), + }) + + # Robustness: does the edge survive without the biggest winners? + trimmed = q.get("net_avg_r_ex_top5") + if trimmed is not None: + if trimmed > 0: + items.append({ + "topic": "robustness", + "text": ( + f"Robustness: expectancy survives removing the top 5% of winners " + f"({trimmed:+.2f}R net/trade) — the edge is not a handful of outliers." + ), + }) + else: + items.append({ + "topic": "robustness", + "text": ( + f"Robustness WARNING: without the top 5% of winners the edge disappears " + f"({trimmed:+.2f}R net/trade) — outlier-dependent, treat the headline " + "expectancy with caution." + ), + }) + + headline = None + if best_hold is not None and target_net is not None and best_hold["net_avg_r"] > target_net + _EXIT_SWITCH_THRESHOLD: + cagr_note = ( + f" (~{hold_sim['cagr_pct']:.0f}% CAGR simulated)" + if hold_sim is not None and hold_sim.get("cagr_pct") is not None + else "" + ) + headline = ( + f"Trade the qualified list long-only; hold {best_hold['hold_days']} trading days " + f"with the initial ATR stop{cagr_note}." + ) + + return { + "headline": headline, + "items": items, + "note": "Derived from this report's numbers on every run — the advice flips if the data does.", + } + + async def run_backtest( db: AsyncSession, progress_cb: Callable[[int, int, str], None] | None = None, @@ -1346,7 +1370,7 @@ async def run_backtest( except Exception: logger.exception("Portfolio simulation failed") - return { + report = { "generated_at": datetime.now(timezone.utc).isoformat(), "tickers": total, "candidates": len(candidates), @@ -1376,8 +1400,6 @@ async def run_backtest( "instead of the S/R target — the view that matters if the exit " "policy moves to a fixed hold." ), - "take_profit_sweep": [_take_profit_bucket(qualified, tp) for tp in TP_LEVELS], - "trailing_sweep": [_trailing_bucket(qualified, round(f * 100)) for f in TRAIL_LEVELS], "time_exit_sweep": [_time_exit_bucket(qualified, n) for n in TIME_EXIT_DAYS], "portfolio_sim": { "params": { @@ -1401,7 +1423,6 @@ async def run_backtest( "same window. In-sample; no dividends." ), }, - "calibration": _calibration(candidates), "signal_eval": _signal_evaluation(collected), "signal_eval_note": ( "Cross-sectional rank-IC of price-only signals vs the forward " @@ -1421,6 +1442,8 @@ async def run_backtest( "~6 months ≈ one market regime — treat as directional, not gospel." ), } + report["recommendation"] = _build_recommendation(report) + return report async def run_and_store( diff --git a/frontend/src/components/signals/BacktestPanel.tsx b/frontend/src/components/signals/BacktestPanel.tsx index cb3a4fb..a05d09e 100644 --- a/frontend/src/components/signals/BacktestPanel.tsx +++ b/frontend/src/components/signals/BacktestPanel.tsx @@ -124,14 +124,6 @@ export function BacktestPanel() { const queryClient = useQueryClient(); const toast = useToast(); - const bestTpAvgR = - report?.take_profit_sweep && report.take_profit_sweep.length > 0 - ? Math.max(...report.take_profit_sweep.map((r) => netOrGross(r) ?? -Infinity)) - : null; - const bestTrailAvgR = - report?.trailing_sweep && report.trailing_sweep.length > 0 - ? Math.max(...report.trailing_sweep.map((r) => netOrGross(r) ?? -Infinity)) - : null; const bestTimeAvgR = report?.time_exit_sweep && report.time_exit_sweep.length > 0 ? Math.max(...report.time_exit_sweep.map((r) => netOrGross(r) ?? -Infinity)) @@ -189,6 +181,30 @@ export function BacktestPanel() { )}
+ {report.recommendation && report.recommendation.items.length > 0 && ( +What this backtest recommends
+ {report.recommendation.headline && ( ++ {report.recommendation.headline} +
+ )} +{report.recommendation.note}
+ )} +- Take-profit exit (alternative to the target above) -
-- Models a realistic exit instead of waiting for the far S/R target: bank{' '} - +X% if price reaches it before the stop, else the - stop-fill loss (a gap through the stop fills at the open, so it can exceed −1R), else exit - at the {report.params.horizon_days}-day close. In R, so it compares to the - target model above. Hit Rate = how often you'd have banked - +X% (how far winners actually run) — no top-ticking, it's the level you'd really set. - The setup's own S/R target is not used here (exiting at that target is the model - above); this is a pure fixed-% exit. ★ = best net avg R. -
-| Take-profit | -Setups | -Hit (banked) | -Hit Rate | -Avg R | -Net Avg R | -Total R | -
|---|---|---|---|---|---|---|
| - {best && ★} - +{row.tp_pct}% - | -{row.total} | -{row.wins} | -{fmtPct(row.hit_rate)} | -{fmtR(row.avg_r)} | -{fmtR(row.net_avg_r ?? null)} | -{fmtR(row.total_r)} | -
- Trailing-stop exit -
-- Let it run, but exit when price gives back X% from its - peak (the stop only ratchets up, never below the initial stop). Captures the tail - without the fixed take-profit's all-or-nothing miss, and protects gains. In R vs the initial - risk. Win Rate = share closed in profit. ★ = best net avg R. -
-| Trail | -Setups | -Profitable | -Win Rate | -Avg R | -Net Avg R | -Total R | -
|---|---|---|---|---|---|---|
| - {best && ★} - {row.trail_pct}% - | -{row.total} | -{row.wins} | -{fmtPct(row.win_rate)} | -{fmtR(row.avg_r)} | -{fmtR(row.net_avg_r ?? null)} | -{fmtR(row.total_r)} | -
@@ -543,6 +483,16 @@ export function BacktestPanel() { ['Avg P&L / trade', (p) => fmtMoney(p.avg_trade_pnl), (p) => rColor(p.avg_trade_pnl)], ['Best / worst trade', (p) => `${fmtR(p.best_trade_r)} / ${fmtR(p.worst_trade_r)}`, () => 'text-gray-300'], ['Avg holding time', (p) => fmtDays(p.avg_hold_days), () => 'text-gray-300'], + [ + 'Per-year returns', + (p) => + p.yearly_returns && p.yearly_returns.length > 0 + ? p.yearly_returns + .map((y) => `${y.year} ${fmtSignedPct(y.return_pct)}`) + .join(' · ') + : '—', + () => 'text-gray-300', + ], ['Entries skipped (book full)', (p) => String(p.skipped_book_full), () => 'text-gray-500'], ] as [string, (p: BacktestPortfolioPolicy) => string, (p: BacktestPortfolioPolicy) => string][] ).map(([label, fmt, color]) => ( @@ -561,47 +511,6 @@ export function BacktestPanel() {
- Probability calibration -
-- Do targets we call “X% likely” actually hit that often? Realized below predicted = - the model is over-confident. -
- {report.calibration.length === 0 ? ( -| Predicted Bucket | -Setups | -Avg Predicted | -Realized Hit Rate | -
|---|---|---|---|
| {row.bucket} | -{row.n} | -{row.predicted_avg.toFixed(0)}% | -- {row.realized_hit_rate.toFixed(0)}% - | -
diff --git a/frontend/src/lib/types.ts b/frontend/src/lib/types.ts index ea8a8c0..99ac2df 100644 --- a/frontend/src/lib/types.ts +++ b/frontend/src/lib/types.ts @@ -236,41 +236,16 @@ export interface BacktestBucket { worst_r?: number | null; avg_hold_days?: number | null; net_r_per_day?: number | null; -} - -export interface BacktestCalibrationRow { - bucket: string; - n: number; - predicted_avg: number; - realized_hit_rate: number; + // Robustness: distribution shape, and expectancy without the top winners. + median_net_r?: number | null; + profit_factor?: number | null; + net_avg_r_ex_top5?: number | null; } export interface BacktestSweepRow extends BacktestBucket { min_momentum_percentile: number; } -export interface BacktestTakeProfitRow { - tp_pct: number; - total: number; - wins: number; - hit_rate: number | null; - avg_r: number | null; - total_r: number | null; - net_avg_r?: number | null; - net_total_r?: number | null; -} - -export interface BacktestTrailingRow { - trail_pct: number; - total: number; - wins: number; - win_rate: number | null; - avg_r: number | null; - total_r: number | null; - net_avg_r?: number | null; - net_total_r?: number | null; -} - export interface BacktestTimeExitRow { hold_days: number; total: number; @@ -304,10 +279,17 @@ export interface BacktestPortfolioPolicy { avg_hold_days: number | null; skipped_book_full: number; spy_return_pct: number | null; + yearly_returns?: { year: number; return_pct: number | null }[]; start_date: string; end_date: string; } +export interface BacktestRecommendation { + headline: string | null; + items: { topic: string; text: string }[]; + note?: string; +} + export interface BacktestPortfolioSim { params: { starting_capital: number; @@ -359,11 +341,9 @@ export interface BacktestReport { sweep: BacktestSweepRow[]; gate_ablation?: BacktestGateAblationRow[]; gate_ablation_note?: string; - take_profit_sweep?: BacktestTakeProfitRow[]; - trailing_sweep?: BacktestTrailingRow[]; time_exit_sweep?: BacktestTimeExitRow[]; portfolio_sim?: BacktestPortfolioSim; - calibration: BacktestCalibrationRow[]; + recommendation?: BacktestRecommendation; signal_eval?: BacktestSignalEvalRow[]; signal_eval_note?: string; note: string; diff --git a/tests/unit/test_backtest_service.py b/tests/unit/test_backtest_service.py index 53c31a5..c47ca7c 100644 --- a/tests/unit/test_backtest_service.py +++ b/tests/unit/test_backtest_service.py @@ -75,116 +75,21 @@ class TestStopFillR: assert bt._stop_fill_r("short", 100.0, 105.0, _bar(110, 104, 108, open_=107)) == pytest.approx(-1.4) -class TestTakeProfitPrimitives: - def test_long_tp_reachable_before_stop(self): - risk, stopped, mfe, close_pct, stop_day, _ = bt._tp_primitives("long", 100.0, 95.0, [_bar(109, 101, 108)], 30) +class TestRiskAndStopDay: + def test_no_stop(self): + risk, stop_day = bt._risk_and_stop_day("long", 100.0, 95.0, [_bar(109, 101, 108)], 30) assert risk == pytest.approx(0.05) - assert stopped is False - assert mfe == pytest.approx(0.09) - assert close_pct == pytest.approx(0.08) assert stop_day is None - def test_long_stop_zeroes_mfe(self): - # Low pierces the stop on the only bar → loss, nothing banked before it. - risk, stopped, mfe, close_pct, stop_day, stop_r = bt._tp_primitives("long", 100.0, 95.0, [_bar(101, 94, 96)], 30) - assert stopped is True - assert mfe == pytest.approx(0.0) - assert close_pct == pytest.approx(-0.04) - assert stop_day == 1 - assert stop_r == pytest.approx(-1.0) - - def test_gap_through_stop_loses_more_than_1r(self): - _, stopped, _, _, stop_day, stop_r = bt._tp_primitives( - "long", 100.0, 95.0, [_bar(93, 90, 91, open_=92)], 30 - ) - assert stopped is True - assert stop_day == 1 - assert stop_r == pytest.approx(-1.6) # filled at the 92 open, not the 95 stop - - def test_long_drift_no_trigger(self): - bars = [_bar(102, 99, 101), _bar(103, 100, 102)] - risk, stopped, mfe, close_pct, _, _ = bt._tp_primitives("long", 100.0, 95.0, bars, 30) - assert stopped is False - assert mfe == pytest.approx(0.03) - assert close_pct == pytest.approx(0.02) + def test_stop_day_is_one_based(self): + bars = [_bar(102, 99, 101), _bar(101, 94, 96)] + risk, stop_day = bt._risk_and_stop_day("long", 100.0, 95.0, bars, 30) + assert risk == pytest.approx(0.05) + assert stop_day == 2 def test_short_direction(self): - # short entry 100, stop 105; price falls → favourable = (entry - low)/entry - risk, stopped, mfe, close_pct, _, _ = bt._tp_primitives("short", 100.0, 105.0, [_bar(101, 92, 93)], 30) - assert risk == pytest.approx(0.05) - assert stopped is False - assert mfe == pytest.approx(0.08) - assert close_pct == pytest.approx(0.07) - - -class TestTakeProfitBucket: - def test_bucket_mix(self): - cands = [ - {"risk_pct": 0.05, "mfe_pct": 0.09, "tp_stopped": False, "tp_close_pct": 0.08}, # +1.6R win - {"risk_pct": 0.05, "mfe_pct": 0.02, "tp_stopped": True, "tp_close_pct": -0.04}, # -1R stop - {"risk_pct": 0.05, "mfe_pct": 0.03, "tp_stopped": False, "tp_close_pct": 0.01}, # +0.2R timeout - ] - b = bt._take_profit_bucket(cands, 0.08) - assert b["total"] == 3 - assert b["wins"] == 1 - assert b["hit_rate"] == pytest.approx(33.3, abs=0.1) - assert b["total_r"] == pytest.approx(0.8, abs=0.01) - assert b["avg_r"] == pytest.approx(0.267, abs=0.01) - # net: minus a 0.04R round trip per candidate (risk_pct 0.05) - assert b["net_total_r"] == pytest.approx(0.8 - 3 * _COST_R_005, abs=0.01) - assert b["net_avg_r"] == pytest.approx((0.8 - 3 * _COST_R_005) / 3, abs=0.01) - - def test_zero_risk_skipped(self): - cands = [{"risk_pct": 0.0, "mfe_pct": 0.2, "tp_stopped": False, "tp_close_pct": 0.1}] - b = bt._take_profit_bucket(cands, 0.08) - assert b["total"] == 0 - assert b["avg_r"] is None - - -class TestTrailingExits: - def test_locks_gain_on_pullback(self): - # Runs to 120, then a 10% trail (from peak 120 → 108) is pierced on the drop. - res = bt._trailing_exits("long", 100.0, 90.0, (0.10,), [_bar(120, 110, 118), _bar(130, 100, 105)], 30) - assert res[10] == pytest.approx(0.8) # (108-100)/100 / 0.10 risk - - def test_initial_stop_caps_loss(self): - # Trail (20%) is looser than the initial stop → initial stop governs = -1R. - res = bt._trailing_exits("long", 100.0, 90.0, (0.20,), [_bar(101, 89, 90)], 30) - assert res[20] == pytest.approx(-1.0) - - def test_timeout_exits_at_close(self): - res = bt._trailing_exits("long", 100.0, 90.0, (0.20,), [_bar(105, 98, 104), _bar(106, 100, 105)], 30) - assert res[20] == pytest.approx(0.5) # close 105 → +5% / 10% risk - - def test_multiple_widths_one_pass(self): - # Tighter trail locks in more here (exit at 114 vs 108). - res = bt._trailing_exits("long", 100.0, 90.0, (0.10, 0.05), [_bar(120, 110, 118), _bar(130, 100, 105)], 30) - assert res[10] == pytest.approx(0.8) - assert res[5] == pytest.approx(1.4) - - def test_gap_through_stop_fills_at_open(self): - # Initial stop 90 governs (20% trail from peak 100 is lower); the bar - # opens at 85, below it → fill at the open. - res = bt._trailing_exits("long", 100.0, 90.0, (0.20,), [_bar(88, 84, 86, open_=85)], 30) - assert res[20] == pytest.approx(-1.5) - - -class TestTrailingBucket: - def test_bucket(self): - cands = [ - {"trail_r": {5: 1.4, 10: 0.8}, "risk_pct": 0.10}, - {"trail_r": {5: -1.0, 10: -1.0}, "risk_pct": 0.10}, - {"trail_r": {5: 0.5, 10: 0.5}, "risk_pct": 0.10}, - ] - b = bt._trailing_bucket(cands, 5) - assert b["total"] == 3 - assert b["wins"] == 2 - assert b["win_rate"] == pytest.approx(66.7, abs=0.1) - assert b["total_r"] == pytest.approx(0.9, abs=0.01) - assert b["avg_r"] == pytest.approx(0.3, abs=0.01) - # net: 0.02R round trip per candidate (risk_pct 0.10) - assert b["net_total_r"] == pytest.approx(0.9 - 3 * 0.02, abs=0.01) - assert b["net_avg_r"] == pytest.approx(0.28, abs=0.01) + _, stop_day = bt._risk_and_stop_day("short", 100.0, 105.0, [_bar(106, 101, 104)], 30) + assert stop_day == 1 class TestTimeExits: @@ -357,6 +262,9 @@ class TestSimulatePortfolio: assert sim["max_drawdown_pct"] == 0.0 assert sim["cagr_pct"] is None # window far too short to annualize assert sim["spy_return_pct"] is None + assert sim["yearly_returns"] == [ + {"year": 2025, "return_pct": pytest.approx(1.2, abs=0.05)} + ] def test_target_policy_exits_at_target(self): closes = [100.0, 102.0, 104.0, 106.0, 108.0, 110.0] @@ -405,6 +313,11 @@ def test_bucket_stats_counts_and_expectancy(): assert s["worst_r"] == -1.0 assert s["avg_hold_days"] == 10.0 assert s["net_r_per_day"] == pytest.approx((1.0 - _COST_R_005) / 10.0, abs=0.001) + # robustness: net rs are [2.96, 1.96, -1.04, -0.04] + assert s["median_net_r"] == pytest.approx(0.96, abs=0.001) + assert s["profit_factor"] == pytest.approx(4.92 / 1.08, abs=0.01) + # ex-top-5%: ceil(4 * 0.05) = 1 winner trimmed → mean of the remaining three + assert s["net_avg_r_ex_top5"] == pytest.approx((1.96 - 1.04 - 0.04) / 3, abs=0.001) def test_bucket_stats_empty(): @@ -423,18 +336,53 @@ def test_bucket_stats_no_risk_pct_means_no_cost(): assert s["net_total_r"] == s["total_r"] -def test_calibration_buckets(): - cands = [ - _cand(65, OUTCOME_TARGET_HIT, 2.0), - _cand(62, OUTCOME_STOP_HIT, 2.0), - _cand(15, OUTCOME_STOP_HIT, 2.0), - ] - rows = bt._calibration(cands) - by_bucket = {r["bucket"]: r for r in rows} - assert by_bucket["60-80%"]["n"] == 2 - assert by_bucket["60-80%"]["realized_hit_rate"] == 50.0 # 1 of 2 hit - assert by_bucket["0-20%"]["n"] == 1 - assert by_bucket["0-20%"]["realized_hit_rate"] == 0.0 +def test_build_recommendation_reads_the_report(): + report = { + "overall_qualified": {"net_avg_r": 0.13, "net_avg_r_ex_top5": 0.05}, + "time_exit_sweep": [ + {"hold_days": 21, "net_avg_r": 0.38}, + {"hold_days": 30, "net_avg_r": 0.50}, + ], + "gate_ablation": [ + {"variant": "all_floors", "total": 100, "hold_net_avg_r": 0.50}, + {"variant": "no_confidence_floor", "total": 130, "hold_net_avg_r": 0.49}, + {"variant": "no_rr_floor", "total": 400, "hold_net_avg_r": 0.34}, + {"variant": "no_neutral_exclusion", "total": 120, "hold_net_avg_r": 0.46}, + ], + "sweep": [ + {"min_momentum_percentile": 80.0, "net_avg_r": 0.13, "total": 100}, + {"min_momentum_percentile": 60.0, "net_avg_r": 0.05, "total": 300}, + {"min_momentum_percentile": 0.0, "net_avg_r": -0.12, "total": 1000}, + ], + "portfolio_sim": {"policies": [ + {"policy": "target", "cagr_pct": 23.7, "total_return_pct": 134.8, + "spy_return_pct": 95.9, "max_drawdown_pct": 20.7}, + {"policy": "hold", "cagr_pct": 31.9, "total_return_pct": 203.6, + "spy_return_pct": 95.9, "max_drawdown_pct": 21.2}, + ]}, + } + rec = bt._build_recommendation(report) + by_topic: dict[str, list[str]] = {} + for item in rec["items"]: + by_topic.setdefault(item["topic"], []).append(item["text"]) + + assert rec["headline"] is not None and "hold 30" in rec["headline"] + assert any("hold 30 trading days" in t for t in by_topic["exit"]) + gate_texts = " | ".join(by_topic["gate"]) + assert "confidence floor adds nothing" in gate_texts + assert "keep the R:R floor" in gate_texts + assert "keep the NEUTRAL exclusion" in gate_texts + assert "80" in by_topic["cutoff"][0] + assert "beats" in by_topic["benchmark"][0] + assert any("not a handful of outliers" in t for t in by_topic["robustness"]) + + +def test_build_recommendation_flags_outlier_dependence(): + rec = bt._build_recommendation({ + "overall_qualified": {"net_avg_r": 0.13, "net_avg_r_ex_top5": -0.02}, + }) + robustness = [i["text"] for i in rec["items"] if i["topic"] == "robustness"] + assert robustness and "WARNING" in robustness[0] def test_window_setups_too_short_returns_empty(): @@ -495,8 +443,8 @@ async def test_run_backtest_smoke(session): assert report["tickers"] == 1 assert isinstance(report["candidates"], int) for key in ( - "overall_qualified", "overall_all", "by_direction", "calibration", "sweep", - "gate_ablation", "time_exit_sweep", + "overall_qualified", "overall_all", "by_direction", "sweep", + "gate_ablation", "time_exit_sweep", "portfolio_sim", "recommendation", ): assert key in report # the oscillating series should yield at least some resolved setups @@ -526,7 +474,3 @@ async def test_run_backtest_smoke(session): sweep = sorted(report["sweep"], key=lambda r: r["min_momentum_percentile"], reverse=True) counts = [r["total"] for r in sweep] assert counts == sorted(counts) # ascending as threshold descends - # every calibration row is internally consistent - for row in report["calibration"]: - assert 0 <= row["realized_hit_rate"] <= 100 - assert row["n"] >= 1