feat: robustness stats + dynamic recommendation; retire settled report sections
Robustness (answers 'is the edge just outliers?'):
- _bucket_stats gains median_net_r, profit_factor, and net_avg_r_ex_top5
  (net expectancy with the top 5% of winners removed); shown as stat tiles.
- Portfolio sim gains per-calendar-year returns, shown in the sim table.
Dynamic recommendation ('What this backtest recommends' panel):
- _build_recommendation derives advice from the report's own numbers on
  every run — exit policy (target vs best hold, with sim CAGRs), which
  gate floors earn their keep (ablation Hold column), best momentum
  cutoff, book-vs-SPY verdict, and an outlier-dependence warning when
  the trimmed expectancy goes non-positive.
Retired (conclusions reached, tables removed from report + UI):
- Take-profit sweep (no interior optimum — fixed TP is the wrong tool
  for momentum), trailing sweep (converged to the hold-to-horizon exit),
  probability calibration (model is display-only by decision).
- _tp_primitives slimmed to _risk_and_stop_day; trailing machinery gone.
Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
@@ -75,116 +75,21 @@ class TestStopFillR:
|
||||
assert bt._stop_fill_r("short", 100.0, 105.0, _bar(110, 104, 108, open_=107)) == pytest.approx(-1.4)
|
||||
|
||||
|
||||
class TestTakeProfitPrimitives:
|
||||
def test_long_tp_reachable_before_stop(self):
|
||||
risk, stopped, mfe, close_pct, stop_day, _ = bt._tp_primitives("long", 100.0, 95.0, [_bar(109, 101, 108)], 30)
|
||||
class TestRiskAndStopDay:
|
||||
def test_no_stop(self):
|
||||
risk, stop_day = bt._risk_and_stop_day("long", 100.0, 95.0, [_bar(109, 101, 108)], 30)
|
||||
assert risk == pytest.approx(0.05)
|
||||
assert stopped is False
|
||||
assert mfe == pytest.approx(0.09)
|
||||
assert close_pct == pytest.approx(0.08)
|
||||
assert stop_day is None
|
||||
|
||||
def test_long_stop_zeroes_mfe(self):
|
||||
# Low pierces the stop on the only bar → loss, nothing banked before it.
|
||||
risk, stopped, mfe, close_pct, stop_day, stop_r = bt._tp_primitives("long", 100.0, 95.0, [_bar(101, 94, 96)], 30)
|
||||
assert stopped is True
|
||||
assert mfe == pytest.approx(0.0)
|
||||
assert close_pct == pytest.approx(-0.04)
|
||||
assert stop_day == 1
|
||||
assert stop_r == pytest.approx(-1.0)
|
||||
|
||||
def test_gap_through_stop_loses_more_than_1r(self):
|
||||
_, stopped, _, _, stop_day, stop_r = bt._tp_primitives(
|
||||
"long", 100.0, 95.0, [_bar(93, 90, 91, open_=92)], 30
|
||||
)
|
||||
assert stopped is True
|
||||
assert stop_day == 1
|
||||
assert stop_r == pytest.approx(-1.6) # filled at the 92 open, not the 95 stop
|
||||
|
||||
def test_long_drift_no_trigger(self):
|
||||
bars = [_bar(102, 99, 101), _bar(103, 100, 102)]
|
||||
risk, stopped, mfe, close_pct, _, _ = bt._tp_primitives("long", 100.0, 95.0, bars, 30)
|
||||
assert stopped is False
|
||||
assert mfe == pytest.approx(0.03)
|
||||
assert close_pct == pytest.approx(0.02)
|
||||
def test_stop_day_is_one_based(self):
|
||||
bars = [_bar(102, 99, 101), _bar(101, 94, 96)]
|
||||
risk, stop_day = bt._risk_and_stop_day("long", 100.0, 95.0, bars, 30)
|
||||
assert risk == pytest.approx(0.05)
|
||||
assert stop_day == 2
|
||||
|
||||
def test_short_direction(self):
|
||||
# short entry 100, stop 105; price falls → favourable = (entry - low)/entry
|
||||
risk, stopped, mfe, close_pct, _, _ = bt._tp_primitives("short", 100.0, 105.0, [_bar(101, 92, 93)], 30)
|
||||
assert risk == pytest.approx(0.05)
|
||||
assert stopped is False
|
||||
assert mfe == pytest.approx(0.08)
|
||||
assert close_pct == pytest.approx(0.07)
|
||||
|
||||
|
||||
class TestTakeProfitBucket:
|
||||
def test_bucket_mix(self):
|
||||
cands = [
|
||||
{"risk_pct": 0.05, "mfe_pct": 0.09, "tp_stopped": False, "tp_close_pct": 0.08}, # +1.6R win
|
||||
{"risk_pct": 0.05, "mfe_pct": 0.02, "tp_stopped": True, "tp_close_pct": -0.04}, # -1R stop
|
||||
{"risk_pct": 0.05, "mfe_pct": 0.03, "tp_stopped": False, "tp_close_pct": 0.01}, # +0.2R timeout
|
||||
]
|
||||
b = bt._take_profit_bucket(cands, 0.08)
|
||||
assert b["total"] == 3
|
||||
assert b["wins"] == 1
|
||||
assert b["hit_rate"] == pytest.approx(33.3, abs=0.1)
|
||||
assert b["total_r"] == pytest.approx(0.8, abs=0.01)
|
||||
assert b["avg_r"] == pytest.approx(0.267, abs=0.01)
|
||||
# net: minus a 0.04R round trip per candidate (risk_pct 0.05)
|
||||
assert b["net_total_r"] == pytest.approx(0.8 - 3 * _COST_R_005, abs=0.01)
|
||||
assert b["net_avg_r"] == pytest.approx((0.8 - 3 * _COST_R_005) / 3, abs=0.01)
|
||||
|
||||
def test_zero_risk_skipped(self):
|
||||
cands = [{"risk_pct": 0.0, "mfe_pct": 0.2, "tp_stopped": False, "tp_close_pct": 0.1}]
|
||||
b = bt._take_profit_bucket(cands, 0.08)
|
||||
assert b["total"] == 0
|
||||
assert b["avg_r"] is None
|
||||
|
||||
|
||||
class TestTrailingExits:
|
||||
def test_locks_gain_on_pullback(self):
|
||||
# Runs to 120, then a 10% trail (from peak 120 → 108) is pierced on the drop.
|
||||
res = bt._trailing_exits("long", 100.0, 90.0, (0.10,), [_bar(120, 110, 118), _bar(130, 100, 105)], 30)
|
||||
assert res[10] == pytest.approx(0.8) # (108-100)/100 / 0.10 risk
|
||||
|
||||
def test_initial_stop_caps_loss(self):
|
||||
# Trail (20%) is looser than the initial stop → initial stop governs = -1R.
|
||||
res = bt._trailing_exits("long", 100.0, 90.0, (0.20,), [_bar(101, 89, 90)], 30)
|
||||
assert res[20] == pytest.approx(-1.0)
|
||||
|
||||
def test_timeout_exits_at_close(self):
|
||||
res = bt._trailing_exits("long", 100.0, 90.0, (0.20,), [_bar(105, 98, 104), _bar(106, 100, 105)], 30)
|
||||
assert res[20] == pytest.approx(0.5) # close 105 → +5% / 10% risk
|
||||
|
||||
def test_multiple_widths_one_pass(self):
|
||||
# Tighter trail locks in more here (exit at 114 vs 108).
|
||||
res = bt._trailing_exits("long", 100.0, 90.0, (0.10, 0.05), [_bar(120, 110, 118), _bar(130, 100, 105)], 30)
|
||||
assert res[10] == pytest.approx(0.8)
|
||||
assert res[5] == pytest.approx(1.4)
|
||||
|
||||
def test_gap_through_stop_fills_at_open(self):
|
||||
# Initial stop 90 governs (20% trail from peak 100 is lower); the bar
|
||||
# opens at 85, below it → fill at the open.
|
||||
res = bt._trailing_exits("long", 100.0, 90.0, (0.20,), [_bar(88, 84, 86, open_=85)], 30)
|
||||
assert res[20] == pytest.approx(-1.5)
|
||||
|
||||
|
||||
class TestTrailingBucket:
|
||||
def test_bucket(self):
|
||||
cands = [
|
||||
{"trail_r": {5: 1.4, 10: 0.8}, "risk_pct": 0.10},
|
||||
{"trail_r": {5: -1.0, 10: -1.0}, "risk_pct": 0.10},
|
||||
{"trail_r": {5: 0.5, 10: 0.5}, "risk_pct": 0.10},
|
||||
]
|
||||
b = bt._trailing_bucket(cands, 5)
|
||||
assert b["total"] == 3
|
||||
assert b["wins"] == 2
|
||||
assert b["win_rate"] == pytest.approx(66.7, abs=0.1)
|
||||
assert b["total_r"] == pytest.approx(0.9, abs=0.01)
|
||||
assert b["avg_r"] == pytest.approx(0.3, abs=0.01)
|
||||
# net: 0.02R round trip per candidate (risk_pct 0.10)
|
||||
assert b["net_total_r"] == pytest.approx(0.9 - 3 * 0.02, abs=0.01)
|
||||
assert b["net_avg_r"] == pytest.approx(0.28, abs=0.01)
|
||||
_, stop_day = bt._risk_and_stop_day("short", 100.0, 105.0, [_bar(106, 101, 104)], 30)
|
||||
assert stop_day == 1
|
||||
|
||||
|
||||
class TestTimeExits:
|
||||
@@ -357,6 +262,9 @@ class TestSimulatePortfolio:
|
||||
assert sim["max_drawdown_pct"] == 0.0
|
||||
assert sim["cagr_pct"] is None # window far too short to annualize
|
||||
assert sim["spy_return_pct"] is None
|
||||
assert sim["yearly_returns"] == [
|
||||
{"year": 2025, "return_pct": pytest.approx(1.2, abs=0.05)}
|
||||
]
|
||||
|
||||
def test_target_policy_exits_at_target(self):
|
||||
closes = [100.0, 102.0, 104.0, 106.0, 108.0, 110.0]
|
||||
@@ -405,6 +313,11 @@ def test_bucket_stats_counts_and_expectancy():
|
||||
assert s["worst_r"] == -1.0
|
||||
assert s["avg_hold_days"] == 10.0
|
||||
assert s["net_r_per_day"] == pytest.approx((1.0 - _COST_R_005) / 10.0, abs=0.001)
|
||||
# robustness: net rs are [2.96, 1.96, -1.04, -0.04]
|
||||
assert s["median_net_r"] == pytest.approx(0.96, abs=0.001)
|
||||
assert s["profit_factor"] == pytest.approx(4.92 / 1.08, abs=0.01)
|
||||
# ex-top-5%: ceil(4 * 0.05) = 1 winner trimmed → mean of the remaining three
|
||||
assert s["net_avg_r_ex_top5"] == pytest.approx((1.96 - 1.04 - 0.04) / 3, abs=0.001)
|
||||
|
||||
|
||||
def test_bucket_stats_empty():
|
||||
@@ -423,18 +336,53 @@ def test_bucket_stats_no_risk_pct_means_no_cost():
|
||||
assert s["net_total_r"] == s["total_r"]
|
||||
|
||||
|
||||
def test_calibration_buckets():
|
||||
cands = [
|
||||
_cand(65, OUTCOME_TARGET_HIT, 2.0),
|
||||
_cand(62, OUTCOME_STOP_HIT, 2.0),
|
||||
_cand(15, OUTCOME_STOP_HIT, 2.0),
|
||||
]
|
||||
rows = bt._calibration(cands)
|
||||
by_bucket = {r["bucket"]: r for r in rows}
|
||||
assert by_bucket["60-80%"]["n"] == 2
|
||||
assert by_bucket["60-80%"]["realized_hit_rate"] == 50.0 # 1 of 2 hit
|
||||
assert by_bucket["0-20%"]["n"] == 1
|
||||
assert by_bucket["0-20%"]["realized_hit_rate"] == 0.0
|
||||
def test_build_recommendation_reads_the_report():
    """Check that ``_build_recommendation`` turns report numbers into advice.

    The fixture is shaped so every topic has one expected reading:

    * time-exit sweep: the 30-day row has the higher ``net_avg_r``
      (0.50 vs 0.38 at 21 days);
    * gate ablation: dropping the confidence floor leaves
      ``hold_net_avg_r`` nearly unchanged (0.49 vs 0.50), while dropping
      the R:R floor (0.34) or the NEUTRAL exclusion (0.46) lowers it;
    * momentum sweep: the 80th-percentile cutoff has the best ``net_avg_r``;
    * portfolio sim: both policies' total return exceeds the SPY figure;
    * robustness: the ex-top-5% expectancy is still positive (0.05).
    """
    qualified_stats = {"net_avg_r": 0.13, "net_avg_r_ex_top5": 0.05}
    hold_rows = [
        {"hold_days": 21, "net_avg_r": 0.38},
        {"hold_days": 30, "net_avg_r": 0.50},
    ]
    ablation_rows = [
        {"variant": "all_floors", "total": 100, "hold_net_avg_r": 0.50},
        {"variant": "no_confidence_floor", "total": 130, "hold_net_avg_r": 0.49},
        {"variant": "no_rr_floor", "total": 400, "hold_net_avg_r": 0.34},
        {"variant": "no_neutral_exclusion", "total": 120, "hold_net_avg_r": 0.46},
    ]
    cutoff_rows = [
        {"min_momentum_percentile": 80.0, "net_avg_r": 0.13, "total": 100},
        {"min_momentum_percentile": 60.0, "net_avg_r": 0.05, "total": 300},
        {"min_momentum_percentile": 0.0, "net_avg_r": -0.12, "total": 1000},
    ]
    policy_rows = [
        {
            "policy": "target",
            "cagr_pct": 23.7,
            "total_return_pct": 134.8,
            "spy_return_pct": 95.9,
            "max_drawdown_pct": 20.7,
        },
        {
            "policy": "hold",
            "cagr_pct": 31.9,
            "total_return_pct": 203.6,
            "spy_return_pct": 95.9,
            "max_drawdown_pct": 21.2,
        },
    ]

    rec = bt._build_recommendation({
        "overall_qualified": qualified_stats,
        "time_exit_sweep": hold_rows,
        "gate_ablation": ablation_rows,
        "sweep": cutoff_rows,
        "portfolio_sim": {"policies": policy_rows},
    })

    # Group the advice texts by topic, keeping the order they were emitted in.
    texts_by_topic: dict[str, list[str]] = {}
    for entry in rec["items"]:
        topic = entry["topic"]
        text = entry["text"]
        if topic not in texts_by_topic:
            texts_by_topic[topic] = []
        texts_by_topic[topic].append(text)

    headline = rec["headline"]
    assert headline is not None and "hold 30" in headline
    assert any("hold 30 trading days" in t for t in texts_by_topic["exit"])

    joined_gate_advice = " | ".join(texts_by_topic["gate"])
    for expected_phrase in (
        "confidence floor adds nothing",
        "keep the R:R floor",
        "keep the NEUTRAL exclusion",
    ):
        assert expected_phrase in joined_gate_advice

    assert "80" in texts_by_topic["cutoff"][0]
    assert "beats" in texts_by_topic["benchmark"][0]
    assert any("not a handful of outliers" in t for t in texts_by_topic["robustness"])
|
||||
|
||||
|
||||
def test_build_recommendation_flags_outlier_dependence():
    """A negative ex-top-5% expectancy must produce a robustness WARNING.

    The report carries only ``overall_qualified``: the headline expectancy
    is positive (0.13) but turns negative (-0.02) once the top winners are
    trimmed, so the first robustness item is expected to contain "WARNING".
    """
    report = {
        "overall_qualified": {"net_avg_r": 0.13, "net_avg_r_ex_top5": -0.02},
    }
    rec = bt._build_recommendation(report)

    robustness = []
    for entry in rec["items"]:
        if entry["topic"] == "robustness":
            robustness.append(entry["text"])

    assert robustness and "WARNING" in robustness[0]
|
||||
|
||||
|
||||
def test_window_setups_too_short_returns_empty():
|
||||
@@ -495,8 +443,8 @@ async def test_run_backtest_smoke(session):
|
||||
assert report["tickers"] == 1
|
||||
assert isinstance(report["candidates"], int)
|
||||
for key in (
|
||||
"overall_qualified", "overall_all", "by_direction", "calibration", "sweep",
|
||||
"gate_ablation", "time_exit_sweep",
|
||||
"overall_qualified", "overall_all", "by_direction", "sweep",
|
||||
"gate_ablation", "time_exit_sweep", "portfolio_sim", "recommendation",
|
||||
):
|
||||
assert key in report
|
||||
# the oscillating series should yield at least some resolved setups
|
||||
@@ -526,7 +474,3 @@ async def test_run_backtest_smoke(session):
|
||||
sweep = sorted(report["sweep"], key=lambda r: r["min_momentum_percentile"], reverse=True)
|
||||
counts = [r["total"] for r in sweep]
|
||||
assert counts == sorted(counts) # ascending as threshold descends
|
||||
# every calibration row is internally consistent
|
||||
for row in report["calibration"]:
|
||||
assert 0 <= row["realized_hit_rate"] <= 100
|
||||
assert row["n"] >= 1
|
||||
|
||||
Reference in New Issue
Block a user