fix: judge robustness under the recommended exit, not the abandoned one
Deploy / lint (push) Successful in 6s
Deploy / test (push) Successful in 57s
Deploy / deploy (push) Successful in 32s

The robustness warning was computed on the target-model distribution
even when the same panel recommends the hold exit, which is internally
inconsistent.
_robustness_stats (median, profit factor, ex-top-5% expectancy) is now
shared by _bucket_stats and _time_exit_bucket. The time-exit table shows
Median Net R and Ex-Top-5% per hold length, and _build_recommendation
reads the trimmed expectancy from the recommended exit's bucket (falling
back to the target model when no hold is recommended or the recommended
bucket has no trimmed expectancy).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
2026-07-02 12:50:13 +02:00
parent 243e369e9a
commit 29a61cb2ca
4 changed files with 52 additions and 17 deletions
+34 -15
View File
@@ -375,14 +375,6 @@ def _bucket_stats(cands: list[dict]) -> dict:
holds = [c["hold_days"] for c in cands if c.get("hold_days")]
avg_hold = sum(holds) / len(holds) if holds else None
net_avg = sum(net_rs) / len(net_rs) if net_rs else None
# Robustness: does the edge depend on a handful of outliers? Median and
# profit factor describe the distribution; ex-top-5% is the expectancy with
# the biggest winners removed — if it stays positive, the edge isn't a
# lottery ticket.
gains = sum(r for r in net_rs if r > 0)
losses_abs = -sum(r for r in net_rs if r < 0)
trim_n = math.ceil(len(net_rs) * 0.05) if net_rs else 0
trimmed = sorted(net_rs, reverse=True)[trim_n:] if net_rs else []
return {
"total": len(cands),
"wins": wins,
@@ -400,7 +392,21 @@ def _bucket_stats(cands: list[dict]) -> dict:
"net_r_per_day": (
round(net_avg / avg_hold, 4) if net_avg is not None and avg_hold else None
),
"median_net_r": round(statistics.median(net_rs), 3) if net_rs else None,
**_robustness_stats(net_rs),
}
def _robustness_stats(net_rs: list[float]) -> dict:
"""Distribution-shape stats: the median (typical) trade, gross wins vs
losses, and the expectancy with the top 5% of winners removed — the direct
test of whether the edge depends on a handful of outliers."""
if not net_rs:
return {"median_net_r": None, "profit_factor": None, "net_avg_r_ex_top5": None}
gains = sum(r for r in net_rs if r > 0)
losses_abs = -sum(r for r in net_rs if r < 0)
trimmed = sorted(net_rs, reverse=True)[math.ceil(len(net_rs) * 0.05):]
return {
"median_net_r": round(statistics.median(net_rs), 3),
"profit_factor": round(gains / losses_abs, 2) if losses_abs > 0 else None,
"net_avg_r_ex_top5": (
round(sum(trimmed) / len(trimmed), 3) if trimmed else None
@@ -466,6 +472,7 @@ def _time_exit_bucket(cands: list[dict], hold_days: int) -> dict:
"net_r_per_day": (
round(net_avg / avg_hold, 4) if net_avg is not None and avg_hold else None
),
**_robustness_stats(net_rs),
}
@@ -1190,15 +1197,27 @@ def _build_recommendation(report: dict) -> dict:
),
})
# Robustness: does the edge survive without the biggest winners?
trimmed = q.get("net_avg_r_ex_top5")
# Robustness: does the edge survive without the biggest winners? Judged on
# the RECOMMENDED exit — outlier dependence under an exit we'd abandon
# would be the wrong warning.
hold_recommended = (
best_hold is not None and target_net is not None
and best_hold["net_avg_r"] > target_net + _EXIT_SWITCH_THRESHOLD
)
if hold_recommended and best_hold.get("net_avg_r_ex_top5") is not None:
trimmed = best_hold["net_avg_r_ex_top5"]
basis = f"under the recommended {best_hold['hold_days']}d hold"
else:
trimmed = q.get("net_avg_r_ex_top5")
basis = "under the S/R target exit"
if trimmed is not None:
if trimmed > 0:
items.append({
"topic": "robustness",
"text": (
f"Robustness: expectancy survives removing the top 5% of winners "
f"({trimmed:+.2f}R net/trade) — the edge is not a handful of outliers."
f"({trimmed:+.2f}R net/trade {basis}) — the edge is not a handful "
"of outliers."
),
})
else:
@@ -1206,13 +1225,13 @@ def _build_recommendation(report: dict) -> dict:
"topic": "robustness",
"text": (
f"Robustness WARNING: without the top 5% of winners the edge disappears "
f"({trimmed:+.2f}R net/trade) — outlier-dependent, treat the headline "
"expectancy with caution."
f"({trimmed:+.2f}R net/trade {basis}) — outlier-dependent, treat the "
"headline expectancy with caution."
),
})
headline = None
if best_hold is not None and target_net is not None and best_hold["net_avg_r"] > target_net + _EXIT_SWITCH_THRESHOLD:
if hold_recommended:
cagr_note = (
f" (~{hold_sim['cagr_pct']:.0f}% CAGR simulated)"
if hold_sim is not None and hold_sim.get("cagr_pct") is not None
@@ -414,6 +414,8 @@ export function BacktestPanel() {
<th className="px-4 py-2.5 text-right">Worst R</th>
<th className="px-4 py-2.5 text-right">Avg Hold</th>
<th className="px-4 py-2.5 text-right">Net R/d</th>
<th className="px-4 py-2.5 text-right">Median Net R</th>
<th className="px-4 py-2.5 text-right">Ex-Top-5%</th>
</tr>
</thead>
<tbody>
@@ -435,6 +437,8 @@ export function BacktestPanel() {
<td className="num px-4 py-2.5 text-right text-red-400">{fmtR(row.worst_r)}</td>
<td className="num px-4 py-2.5 text-right text-gray-400">{fmtDays(row.avg_hold_days)}</td>
<td className={`num px-4 py-2.5 text-right ${rColor(row.net_r_per_day ?? null)}`}>{fmtRPerDay(row.net_r_per_day)}</td>
<td className={`num px-4 py-2.5 text-right ${rColor(row.median_net_r ?? null)}`}>{fmtR(row.median_net_r)}</td>
<td className={`num px-4 py-2.5 text-right ${rColor(row.net_avg_r_ex_top5 ?? null)}`}>{fmtR(row.net_avg_r_ex_top5)}</td>
</tr>
);
})}
+3
View File
@@ -259,6 +259,9 @@ export interface BacktestTimeExitRow {
worst_r?: number | null;
avg_hold_days?: number | null;
net_r_per_day?: number | null;
median_net_r?: number | null;
profit_factor?: number | null;
net_avg_r_ex_top5?: number | null;
}
export interface BacktestPortfolioPolicy {
+11 -2
View File
@@ -143,6 +143,10 @@ class TestTimeExitBucket:
# No stop_day on any candidate → every hold runs the full 5 days.
assert b["avg_hold_days"] == 5.0
assert b["net_r_per_day"] == pytest.approx(0.28 / 5.0, abs=0.001)
# robustness on net rs [1.38, -1.02, 0.48]
assert b["median_net_r"] == pytest.approx(0.48, abs=0.001)
assert b["profit_factor"] == pytest.approx(1.86 / 1.02, abs=0.01)
assert b["net_avg_r_ex_top5"] == pytest.approx((0.48 - 1.02) / 2, abs=0.001)
def test_missing_hold_skipped(self):
b = bt._time_exit_bucket([{"time_r": {5: 1.0}}], 21)
@@ -341,7 +345,7 @@ def test_build_recommendation_reads_the_report():
"overall_qualified": {"net_avg_r": 0.13, "net_avg_r_ex_top5": 0.05},
"time_exit_sweep": [
{"hold_days": 21, "net_avg_r": 0.38},
{"hold_days": 30, "net_avg_r": 0.50},
{"hold_days": 30, "net_avg_r": 0.50, "net_avg_r_ex_top5": 0.21},
],
"gate_ablation": [
{"variant": "all_floors", "total": 100, "hold_net_avg_r": 0.50},
@@ -374,7 +378,12 @@ def test_build_recommendation_reads_the_report():
assert "keep the NEUTRAL exclusion" in gate_texts
assert "80" in by_topic["cutoff"][0]
assert "beats" in by_topic["benchmark"][0]
assert any("not a handful of outliers" in t for t in by_topic["robustness"])
# robustness is judged under the RECOMMENDED exit (the 30d hold), not the
# target model the recommendation advises abandoning
assert any(
"not a handful of outliers" in t and "under the recommended 30d hold" in t
for t in by_topic["robustness"]
)
def test_build_recommendation_flags_outlier_dependence():