fix: judge robustness under the recommended exit, not the abandoned one

The robustness warning was computed on the target-model distribution even though the same panel recommends the hold exit, which made the panel internally inconsistent. _robustness_stats (median, profit factor, ex-top-5% expectancy) is now shared by _bucket_stats and _time_exit_bucket; the time-exit table shows Median Net R and Ex-Top-5% per hold length; and _build_recommendation reads the trimmed expectancy from the recommended exit's bucket (falling back to the target model when no hold is recommended or when that bucket has no trimmed expectancy).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-07-02 12:50:13 +02:00
parent 243e369e9a
commit 29a61cb2ca
4 changed files with 52 additions and 17 deletions
@@ -375,14 +375,6 @@ def _bucket_stats(cands: list[dict]) -> dict:
    holds = [c["hold_days"] for c in cands if c.get("hold_days")]
    avg_hold = sum(holds) / len(holds) if holds else None
    net_avg = sum(net_rs) / len(net_rs) if net_rs else None
-    # Robustness: does the edge depend on a handful of outliers? Median and
-    # profit factor describe the distribution; ex-top-5% is the expectancy with
-    # the biggest winners removed — if it stays positive, the edge isn't a
-    # lottery ticket.
-    gains = sum(r for r in net_rs if r > 0)
-    losses_abs = -sum(r for r in net_rs if r < 0)
-    trim_n = math.ceil(len(net_rs) * 0.05) if net_rs else 0
-    trimmed = sorted(net_rs, reverse=True)[trim_n:] if net_rs else []
    return {
        "total": len(cands),
        "wins": wins,
@@ -400,7 +392,21 @@ def _bucket_stats(cands: list[dict]) -> dict:
        "net_r_per_day": (
            round(net_avg / avg_hold, 4) if net_avg is not None and avg_hold else None
        ),
-        "median_net_r": round(statistics.median(net_rs), 3) if net_rs else None,
+        **_robustness_stats(net_rs),
+    }
+
+
+def _robustness_stats(net_rs: list[float]) -> dict:
+    """Distribution-shape stats: the median (typical) trade, gross wins vs
+    losses, and the expectancy with the top 5% of winners removed — the direct
+    test of whether the edge depends on a handful of outliers."""
+    if not net_rs:
+        return {"median_net_r": None, "profit_factor": None, "net_avg_r_ex_top5": None}
+    gains = sum(r for r in net_rs if r > 0)
+    losses_abs = -sum(r for r in net_rs if r < 0)
+    trimmed = sorted(net_rs, reverse=True)[math.ceil(len(net_rs) * 0.05):]
+    return {
+        "median_net_r": round(statistics.median(net_rs), 3),
        "profit_factor": round(gains / losses_abs, 2) if losses_abs > 0 else None,
        "net_avg_r_ex_top5": (
            round(sum(trimmed) / len(trimmed), 3) if trimmed else None
@@ -466,6 +472,7 @@ def _time_exit_bucket(cands: list[dict], hold_days: int) -> dict:
        "net_r_per_day": (
            round(net_avg / avg_hold, 4) if net_avg is not None and avg_hold else None
        ),
+        **_robustness_stats(net_rs),
    }


@@ -1190,15 +1197,27 @@ def _build_recommendation(report: dict) -> dict:
            ),
        })

-    # Robustness: does the edge survive without the biggest winners?
-    trimmed = q.get("net_avg_r_ex_top5")
+    # Robustness: does the edge survive without the biggest winners? Judged on
+    # the RECOMMENDED exit — outlier dependence under an exit we'd abandon
+    # would be the wrong warning.
+    hold_recommended = (
+        best_hold is not None and target_net is not None
+        and best_hold["net_avg_r"] > target_net + _EXIT_SWITCH_THRESHOLD
+    )
+    if hold_recommended and best_hold.get("net_avg_r_ex_top5") is not None:
+        trimmed = best_hold["net_avg_r_ex_top5"]
+        basis = f"under the recommended {best_hold['hold_days']}d hold"
+    else:
+        trimmed = q.get("net_avg_r_ex_top5")
+        basis = "under the S/R target exit"
    if trimmed is not None:
        if trimmed > 0:
            items.append({
                "topic": "robustness",
                "text": (
                    f"Robustness: expectancy survives removing the top 5% of winners "
-                    f"({trimmed:+.2f}R net/trade) — the edge is not a handful of outliers."
+                    f"({trimmed:+.2f}R net/trade {basis}) — the edge is not a handful "
+                    "of outliers."
                ),
            })
        else:
@@ -1206,13 +1225,13 @@ def _build_recommendation(report: dict) -> dict:
                "topic": "robustness",
                "text": (
                    f"Robustness WARNING: without the top 5% of winners the edge disappears "
-                    f"({trimmed:+.2f}R net/trade) — outlier-dependent, treat the headline "
-                    "expectancy with caution."
+                    f"({trimmed:+.2f}R net/trade {basis}) — outlier-dependent, treat the "
+                    "headline expectancy with caution."
                ),
            })

    headline = None
-    if best_hold is not None and target_net is not None and best_hold["net_avg_r"] > target_net + _EXIT_SWITCH_THRESHOLD:
+    if hold_recommended:
        cagr_note = (
            f" (~{hold_sim['cagr_pct']:.0f}% CAGR simulated)"
            if hold_sim is not None and hold_sim.get("cagr_pct") is not None
@@ -414,6 +414,8 @@ export function BacktestPanel() {
                        <th className="px-4 py-2.5 text-right">Worst R</th>
                        <th className="px-4 py-2.5 text-right">Avg Hold</th>
                        <th className="px-4 py-2.5 text-right">Net R/d</th>
+                        <th className="px-4 py-2.5 text-right">Median Net R</th>
+                        <th className="px-4 py-2.5 text-right">Ex-Top-5%</th>
                      </tr>
                    </thead>
                    <tbody>
@@ -435,6 +437,8 @@ export function BacktestPanel() {
                            <td className="num px-4 py-2.5 text-right text-red-400">{fmtR(row.worst_r)}</td>
                            <td className="num px-4 py-2.5 text-right text-gray-400">{fmtDays(row.avg_hold_days)}</td>
                            <td className={`num px-4 py-2.5 text-right ${rColor(row.net_r_per_day ?? null)}`}>{fmtRPerDay(row.net_r_per_day)}</td>
+                            <td className={`num px-4 py-2.5 text-right ${rColor(row.median_net_r ?? null)}`}>{fmtR(row.median_net_r)}</td>
+                            <td className={`num px-4 py-2.5 text-right ${rColor(row.net_avg_r_ex_top5 ?? null)}`}>{fmtR(row.net_avg_r_ex_top5)}</td>
                          </tr>
                        );
                      })}
@@ -259,6 +259,9 @@ export interface BacktestTimeExitRow {
  worst_r?: number | null;
  avg_hold_days?: number | null;
  net_r_per_day?: number | null;
+  median_net_r?: number | null;
+  profit_factor?: number | null;
+  net_avg_r_ex_top5?: number | null;
 }

 export interface BacktestPortfolioPolicy {
@@ -143,6 +143,10 @@ class TestTimeExitBucket:
        # No stop_day on any candidate → every hold runs the full 5 days.
        assert b["avg_hold_days"] == 5.0
        assert b["net_r_per_day"] == pytest.approx(0.28 / 5.0, abs=0.001)
+        # robustness on net rs [1.38, -1.02, 0.48]
+        assert b["median_net_r"] == pytest.approx(0.48, abs=0.001)
+        assert b["profit_factor"] == pytest.approx(1.86 / 1.02, abs=0.01)
+        assert b["net_avg_r_ex_top5"] == pytest.approx((0.48 - 1.02) / 2, abs=0.001)

    def test_missing_hold_skipped(self):
        b = bt._time_exit_bucket([{"time_r": {5: 1.0}}], 21)
@@ -341,7 +345,7 @@ def test_build_recommendation_reads_the_report():
        "overall_qualified": {"net_avg_r": 0.13, "net_avg_r_ex_top5": 0.05},
        "time_exit_sweep": [
            {"hold_days": 21, "net_avg_r": 0.38},
-            {"hold_days": 30, "net_avg_r": 0.50},
+            {"hold_days": 30, "net_avg_r": 0.50, "net_avg_r_ex_top5": 0.21},
        ],
        "gate_ablation": [
            {"variant": "all_floors", "total": 100, "hold_net_avg_r": 0.50},
@@ -374,7 +378,12 @@ def test_build_recommendation_reads_the_report():
    assert "keep the NEUTRAL exclusion" in gate_texts
    assert "80" in by_topic["cutoff"][0]
    assert "beats" in by_topic["benchmark"][0]
-    assert any("not a handful of outliers" in t for t in by_topic["robustness"])
+    # robustness is judged under the RECOMMENDED exit (the 30d hold), not the
+    # target model the recommendation advises abandoning
+    assert any(
+        "not a handful of outliers" in t and "under the recommended 30d hold" in t
+        for t in by_topic["robustness"]
+    )


 def test_build_recommendation_flags_outlier_dependence():