diff --git a/app/services/backtest_service.py b/app/services/backtest_service.py index 13a8340..9adacbf 100644 --- a/app/services/backtest_service.py +++ b/app/services/backtest_service.py @@ -375,14 +375,6 @@ def _bucket_stats(cands: list[dict]) -> dict: holds = [c["hold_days"] for c in cands if c.get("hold_days")] avg_hold = sum(holds) / len(holds) if holds else None net_avg = sum(net_rs) / len(net_rs) if net_rs else None - # Robustness: does the edge depend on a handful of outliers? Median and - # profit factor describe the distribution; ex-top-5% is the expectancy with - # the biggest winners removed — if it stays positive, the edge isn't a - # lottery ticket. - gains = sum(r for r in net_rs if r > 0) - losses_abs = -sum(r for r in net_rs if r < 0) - trim_n = math.ceil(len(net_rs) * 0.05) if net_rs else 0 - trimmed = sorted(net_rs, reverse=True)[trim_n:] if net_rs else [] return { "total": len(cands), "wins": wins, @@ -400,7 +392,21 @@ def _bucket_stats(cands: list[dict]) -> dict: "net_r_per_day": ( round(net_avg / avg_hold, 4) if net_avg is not None and avg_hold else None ), - "median_net_r": round(statistics.median(net_rs), 3) if net_rs else None, + **_robustness_stats(net_rs), + } + + +def _robustness_stats(net_rs: list[float]) -> dict: + """Distribution-shape stats: the median (typical) trade, gross wins vs + losses, and the expectancy with the top 5% of winners removed — the direct + test of whether the edge depends on a handful of outliers.""" + if not net_rs: + return {"median_net_r": None, "profit_factor": None, "net_avg_r_ex_top5": None} + gains = sum(r for r in net_rs if r > 0) + losses_abs = -sum(r for r in net_rs if r < 0) + trimmed = sorted(net_rs, reverse=True)[math.ceil(len(net_rs) * 0.05):] + return { + "median_net_r": round(statistics.median(net_rs), 3), "profit_factor": round(gains / losses_abs, 2) if losses_abs > 0 else None, "net_avg_r_ex_top5": ( round(sum(trimmed) / len(trimmed), 3) if trimmed else None @@ -466,6 +472,7 @@ def _time_exit_bucket(cands: list[dict], hold_days: int) -> dict: "net_r_per_day": ( round(net_avg / avg_hold, 4) if net_avg is not None and avg_hold else None ), + **_robustness_stats(net_rs), } @@ -1190,15 +1197,27 @@ def _build_recommendation(report: dict) -> dict: ), }) - # Robustness: does the edge survive without the biggest winners? - trimmed = q.get("net_avg_r_ex_top5") + # Robustness: does the edge survive without the biggest winners? Judged on + # the RECOMMENDED exit — outlier dependence under an exit we'd abandon + # would be the wrong warning. + hold_recommended = ( + best_hold is not None and target_net is not None + and best_hold["net_avg_r"] > target_net + _EXIT_SWITCH_THRESHOLD + ) + if hold_recommended and best_hold.get("net_avg_r_ex_top5") is not None: + trimmed = best_hold["net_avg_r_ex_top5"] + basis = f"under the recommended {best_hold['hold_days']}d hold" + else: + trimmed = q.get("net_avg_r_ex_top5") + basis = "under the S/R target exit" if trimmed is not None: if trimmed > 0: items.append({ "topic": "robustness", "text": ( f"Robustness: expectancy survives removing the top 5% of winners " - f"({trimmed:+.2f}R net/trade) — the edge is not a handful of outliers." + f"({trimmed:+.2f}R net/trade {basis}) — the edge is not a handful " + "of outliers." ), }) else: @@ -1206,13 +1225,13 @@ def _build_recommendation(report: dict) -> dict: "topic": "robustness", "text": ( f"Robustness WARNING: without the top 5% of winners the edge disappears " - f"({trimmed:+.2f}R net/trade) — outlier-dependent, treat the headline " - "expectancy with caution." + f"({trimmed:+.2f}R net/trade {basis}) — outlier-dependent, treat the " + "headline expectancy with caution." ), }) headline = None - if best_hold is not None and target_net is not None and best_hold["net_avg_r"] > target_net + _EXIT_SWITCH_THRESHOLD: + if hold_recommended: cagr_note = ( f" (~{hold_sim['cagr_pct']:.0f}% CAGR simulated)" if hold_sim is not None and hold_sim.get("cagr_pct") is not None diff --git a/frontend/src/components/signals/BacktestPanel.tsx b/frontend/src/components/signals/BacktestPanel.tsx index a05d09e..51f16eb 100644 --- a/frontend/src/components/signals/BacktestPanel.tsx +++ b/frontend/src/components/signals/BacktestPanel.tsx @@ -414,6 +414,8 @@ export function BacktestPanel() { Worst R Avg Hold Net R/d + Median Net R + Ex-Top-5% @@ -435,6 +437,8 @@ export function BacktestPanel() { {fmtR(row.worst_r)} {fmtDays(row.avg_hold_days)} {fmtRPerDay(row.net_r_per_day)} + {fmtR(row.median_net_r)} + {fmtR(row.net_avg_r_ex_top5)} ); })} diff --git a/frontend/src/lib/types.ts b/frontend/src/lib/types.ts index 99ac2df..b223089 100644 --- a/frontend/src/lib/types.ts +++ b/frontend/src/lib/types.ts @@ -259,6 +259,9 @@ export interface BacktestTimeExitRow { worst_r?: number | null; avg_hold_days?: number | null; net_r_per_day?: number | null; + median_net_r?: number | null; + profit_factor?: number | null; + net_avg_r_ex_top5?: number | null; } export interface BacktestPortfolioPolicy { diff --git a/tests/unit/test_backtest_service.py b/tests/unit/test_backtest_service.py index c47ca7c..0966703 100644 --- a/tests/unit/test_backtest_service.py +++ b/tests/unit/test_backtest_service.py @@ -143,6 +143,10 @@ class TestTimeExitBucket: # No stop_day on any candidate → every hold runs the full 5 days. assert b["avg_hold_days"] == 5.0 assert b["net_r_per_day"] == pytest.approx(0.28 / 5.0, abs=0.001) + # robustness on net rs [1.38, -1.02, 0.48] + assert b["median_net_r"] == pytest.approx(0.48, abs=0.001) + assert b["profit_factor"] == pytest.approx(1.86 / 1.02, abs=0.01) + assert b["net_avg_r_ex_top5"] == pytest.approx((0.48 - 1.02) / 2, abs=0.001) def test_missing_hold_skipped(self): b = bt._time_exit_bucket([{"time_r": {5: 1.0}}], 21) @@ -341,7 +345,7 @@ def test_build_recommendation_reads_the_report(): "overall_qualified": {"net_avg_r": 0.13, "net_avg_r_ex_top5": 0.05}, "time_exit_sweep": [ {"hold_days": 21, "net_avg_r": 0.38}, - {"hold_days": 30, "net_avg_r": 0.50}, + {"hold_days": 30, "net_avg_r": 0.50, "net_avg_r_ex_top5": 0.21}, ], "gate_ablation": [ {"variant": "all_floors", "total": 100, "hold_net_avg_r": 0.50}, @@ -374,7 +378,12 @@ def test_build_recommendation_reads_the_report(): assert "keep the NEUTRAL exclusion" in gate_texts assert "80" in by_topic["cutoff"][0] assert "beats" in by_topic["benchmark"][0] - assert any("not a handful of outliers" in t for t in by_topic["robustness"]) + # robustness is judged under the RECOMMENDED exit (the 30d hold), not the + # target model the recommendation advises abandoning + assert any( + "not a handful of outliers" in t and "under the recommended 30d hold" in t + for t in by_topic["robustness"] + ) def test_build_recommendation_flags_outlier_dependence():