feat: robustness stats + dynamic recommendation; retire settled report sections

Robustness (answers 'is the edge just outliers?'): - _bucket_stats gains median_net_r, profit_factor, and net_avg_r_ex_top5 (expectancy with the top 5% of winners removed); shown as stat tiles. - Portfolio sim gains per-calendar-year returns, shown in the sim table. Dynamic recommendation ('What this backtest recommends' panel): - _build_recommendation derives advice from the report's own numbers on every run — exit policy (target vs best hold, with sim CAGRs), which gate floors earn their keep (ablation Hold column), best momentum cutoff, book-vs-SPY verdict, and an outlier-dependence warning when the trimmed expectancy goes non-positive. Retired (conclusions reached, tables removed from report + UI): - Take-profit sweep (no interior optimum — fixed TP is the wrong tool for momentum), trailing sweep (converged to the hold-to-horizon exit), probability calibration (model is display-only by decision). - _tp_primitives slimmed to _risk_and_stop_day; trailing machinery gone. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-07-02 12:33:22 +02:00
parent 0f43e755f4
commit 243e369e9a
4 changed files with 359 additions and 503 deletions
@@ -3,13 +3,21 @@ OHLCV and measure how the CURRENT config would have performed.
 For each ticker we step through history (weekly), and at each as-of date D we
 rebuild the setup using only bars ≤ D (no lookahead), then walk the actual bars
-after D to record the realized outcome. Two reports come out:
+after D to record the realized outcome. The report contains:
-  - realized hit-rate / expectancy of qualified setups (and of all setups)
+  - hit-rate / expectancy of qualified setups vs the all-setups control group,
-  - a probability calibration curve: do "60% likely" targets hit ~60% of the time?
+    gross and net of costs, with robustness stats (median, profit factor,
    expectancy without the top winners)
  - the momentum-percentile sweep and the gate ablation (each floor removed in
    turn, graded under both the target and the hold-to-horizon exit)
  - the time-exit sweep (hold N days with the initial stop)
  - cross-sectional factor rank-IC ("signal edge")
  - a capital-constrained portfolio simulation (equity curve → CAGR, drawdown,
    Sharpe, SPY comparison)
  - a data-driven recommendation derived from this report's numbers
 Limitation: sentiment and fundamentals have no point-in-time history, so they're
-held neutral here — this calibrates the price/S-R/probability machinery only.
+held neutral here — this calibrates the price/S-R machinery only.
 """
 from __future__ import annotations
@@ -20,6 +28,7 @@ import logging
 import math
 import multiprocessing
 import os
 import statistics
 from collections import defaultdict
 from collections.abc import Callable
 from concurrent.futures import ProcessPoolExecutor
@@ -75,8 +84,6 @@ MIN_LOOKBACK = 60      # bars needed before D for indicators (EMA cross needs 51
 HORIZON = 30           # trading days to resolve an outcome (matches the evaluator)
 ATR_MULTIPLIER = 1.5
 _CAL_BUCKETS = [(0, 20), (20, 40), (40, 60), (60, 80), (80, 100.01)]
 # Cross-sectional signal evaluation (factor IC). Each candidate signal is a
 # point-in-time number computed from closes alone (sentiment/fundamentals have no
 # history here), sampled one as-of per ISO week, and graded by how its rank
@@ -231,102 +238,19 @@ def _stop_fill_r(direction: str, entry: float, stop: float, bar) -> float:
    return (entry - fill) / risk
-def _tp_primitives(
+def _risk_and_stop_day(
    direction: str, entry: float, stop: float, forward: list, horizon: int
-) -> tuple[float, bool, float, float, int | None, float]:
+) -> tuple[float, int | None]:
-    """Primitives for the take-profit exit model, from the bars after detection.
+    """``(risk_pct, stop_day)`` from the bars after detection: the 1R stop
-
+    distance as a fraction of entry, and the 1-based trading day the initial
-    Returns ``(risk_pct, stopped, mfe_pct, close_pct, stop_day, stop_r)``:
+    stop was first pierced within the horizon (None if never). Feeds the cost
-      - ``risk_pct``  fraction from entry to stop (the 1R distance)
+    conversion and the time-exit hold accounting."""
      - ``stopped``   whether the stop was hit within the horizon
      - ``mfe_pct``   best favourable excursion (fraction) reachable *before* the
                      stop — strictly before the stop bar, so a same-bar tp+stop
                      counts as a loss (matching the conservative target model);
                      over the whole horizon if the stop is never hit
      - ``close_pct`` directional return at the horizon-end close (the timeout exit)
      - ``stop_day``  1-based trading day the stop was pierced, None if never
      - ``stop_r``    realized R at the stop fill (≤ −1 when the bar gapped
                      through the stop — see _stop_fill_r); −1.0 when unused
    From these any fixed take-profit level can be scored without re-walking bars:
    tp reached before stop (``mfe_pct >= tp``) → +tp; else stop → ``stop_r``;
    else the horizon-close move.
    """
    long = direction == "long"
    risk_pct = abs(entry - stop) / entry if entry else 0.0
-    bars = forward[:horizon]
+    for i, r in enumerate(forward[:horizon]):
    if not bars:
        return risk_pct, False, 0.0, 0.0, None, -1.0
    mfe = 0.0
    stopped = False
    stop_day: int | None = None
    stop_r = -1.0
    for i, r in enumerate(bars):
        if (r.low <= stop) if long else (r.high >= stop):
-            stopped = True
+            return risk_pct, i + 1
-            stop_day = i + 1
+    return risk_pct, None
            stop_r = _stop_fill_r(direction, entry, stop, r)
            break
        fav = (r.high - entry) / entry if long else (entry - r.low) / entry
        if fav > mfe:
            mfe = fav
    close_pct = ((bars[-1].close - entry) / entry) * (1.0 if long else -1.0)
    return risk_pct, stopped, mfe, close_pct, stop_day, stop_r
 def _trailing_exits(
    direction: str, entry: float, init_stop: float, trail_fracs, forward: list, horizon: int
 ) -> dict[int, float]:
    """Realized R per trailing-stop width, in one pass over the post-entry bars.
    The stop ratchets up (never below the initial stop): ``max(init_stop,
    peak*(1-trail))`` for a long. Exit when a bar pierces the current stop (filled
    at the stop level), else at the horizon-end close. Each width is keyed by its
    integer percent (5 for 0.05). Conservative: the stop for a bar uses the peak
    through the *previous* bar (this bar's high is folded in only afterwards).
    R is relative to the initial risk (entry → init_stop).
    """
    long = direction == "long"
    risk = abs(entry - init_stop) / entry if entry else 0.0
    if risk <= 0:
        return {round(f * 100): 0.0 for f in trail_fracs}
    bars = forward[:horizon]
    if not bars:
        return {round(f * 100): 0.0 for f in trail_fracs}
    result: dict[int, float] = {}
    peak = entry
    active = list(trail_fracs)
    for r in bars:
        remaining = []
        for f in active:
            if long:
                stop_level = max(init_stop, peak * (1 - f))
                if r.low <= stop_level:
                    fill = min(stop_level, r.open)  # gap through fills at the open
                    result[round(f * 100)] = ((fill - entry) / entry) / risk
                    continue
            else:
                stop_level = min(init_stop, peak * (1 + f))
                if r.high >= stop_level:
                    fill = max(stop_level, r.open)
                    result[round(f * 100)] = ((entry - fill) / entry) / risk
                    continue
            remaining.append(f)
        active = remaining
        if not active:
            break
        if long:
            if r.high > peak:
                peak = r.high
        elif r.low < peak:
            peak = r.low
    last_close = bars[-1].close
    timeout_r = (((last_close - entry) / entry) if long else ((entry - last_close) / entry)) / risk
    for f in active:
        result[round(f * 100)] = timeout_r
    return result
 def _time_exits(
@@ -337,8 +261,8 @@ def _time_exits(
    The initial stop stays active (fill at the stop level → −1R); otherwise the
    trade exits at the day-N close (the last available close when history ends
    early). No target, no trailing — the classic momentum implementation: buy,
-    hold ~N days, re-rank. Same conservative bar logic as ``_tp_primitives``: a
+    hold ~N days, re-rank. Conservative bar logic: a bar that pierces the stop
-    bar that pierces the stop is a loss before that bar's close counts.
+    is a loss before that bar's close counts.
    """
    long = direction == "long"
    risk = abs(entry - stop) / entry if entry else 0.0
@@ -405,14 +329,9 @@ def _replay_ticker(symbol: str, records: list, config: dict, activation: dict) -
                )
            else:  # expired
                realized_r = 0.0
-            # Take-profit exit primitives (parallel to the target-vs-stop outcome
+            risk_pct, stop_day = _risk_and_stop_day(
            # above; aggregated separately into the take-profit sweep).
            risk_pct, tp_stopped, mfe_pct, tp_close_pct, stop_day, tp_stop_r = _tp_primitives(
                s["direction"], s["entry"], s["stop"], forward, HORIZON
            )
            trail_r = _trailing_exits(
                s["direction"], s["entry"], s["stop"], TRAIL_LEVELS, forward, HORIZON
            )
            time_r = _time_exits(
                s["direction"], s["entry"], s["stop"], forward, TIME_EXIT_DAYS
            )
@@ -441,11 +360,6 @@ def _replay_ticker(symbol: str, records: list, config: dict, activation: dict) -
                "hold_days": hold_days,
                "stop_day": stop_day,
                "risk_pct": risk_pct,
                "tp_stopped": tp_stopped,
                "tp_stop_r": tp_stop_r,
                "mfe_pct": mfe_pct,
                "tp_close_pct": tp_close_pct,
                "trail_r": trail_r,
                "time_r": time_r,
            })
    return candidates
@@ -461,6 +375,14 @@ def _bucket_stats(cands: list[dict]) -> dict:
    holds = [c["hold_days"] for c in cands if c.get("hold_days")]
    avg_hold = sum(holds) / len(holds) if holds else None
    net_avg = sum(net_rs) / len(net_rs) if net_rs else None
    # Robustness: does the edge depend on a handful of outliers? Median and
    # profit factor describe the distribution; ex-top-5% is the expectancy with
    # the biggest winners removed — if it stays positive, the edge isn't a
    # lottery ticket.
    gains = sum(r for r in net_rs if r > 0)
    losses_abs = -sum(r for r in net_rs if r < 0)
    trim_n = math.ceil(len(net_rs) * 0.05) if net_rs else 0
    trimmed = sorted(net_rs, reverse=True)[trim_n:] if net_rs else []
    return {
        "total": len(cands),
        "wins": wins,
@@ -478,17 +400,18 @@ def _bucket_stats(cands: list[dict]) -> dict:
        "net_r_per_day": (
            round(net_avg / avg_hold, 4) if net_avg is not None and avg_hold else None
        ),
        "median_net_r": round(statistics.median(net_rs), 3) if net_rs else None,
        "profit_factor": round(gains / losses_abs, 2) if losses_abs > 0 else None,
        "net_avg_r_ex_top5": (
            round(sum(trimmed) / len(trimmed), 3) if trimmed else None
        ),
    }
-# Fixed take-profit levels (fractions) swept for the take-profit exit model.
+# The fixed take-profit and trailing-stop sweeps were retired 2026-07: swept
-# Extended into the tail so the avg-R peak/plateau is visible (it's where letting
+# TPs never found an interior optimum (momentum's edge lives in the right tail)
-# winners run stops paying). Note: this model ignores the setup's S/R target —
+# and wide trails converged to the hold-to-horizon exit, so the time-exit sweep
-# it's a standalone fixed-% exit; exiting at the target is the target model.
+# is the exit-decision surface.
 TP_LEVELS = (0.04, 0.06, 0.08, 0.10, 0.12, 0.15, 0.20, 0.25, 0.30, 0.40, 0.50)
 # Trailing-stop widths (give-back from the peak) swept for the trailing exit model.
 TRAIL_LEVELS = (0.03, 0.05, 0.07, 0.10, 0.15, 0.20, 0.25, 0.30)
 # Hold-N-days exits (initial stop stays active, exit at the day-N close) — the
 # classic cross-sectional momentum implementation: buy, hold ~a month, re-rank.
@@ -507,65 +430,6 @@ def _cost_r(cand: dict) -> float:
    return (2.0 * COST_PER_SIDE) / risk if risk > 0 else 0.0
 def _take_profit_bucket(cands: list[dict], tp: float) -> dict:
    """Aggregate a fixed take-profit exit at +``tp`` (a fraction) over ``cands``.
    Per candidate, in R (move% / risk%): +tp when ``mfe_pct`` reached it, else
    the recorded stop-fill R (``tp_stop_r``, default −1.0) when ``tp_stopped``
    is set, else the horizon-close move ``tp_close_pct``. Candidates without a
    positive ``risk_pct`` are left out. Net figures subtract ``_cost_r``.
    ``hit_rate`` is the share of scored candidates that banked +tp."""
    scored: list[tuple[float, float, bool]] = []  # (gross R, net R, banked +tp?)
    for cand in cands:
        one_r = cand.get("risk_pct") or 0.0
        if one_r <= 0:
            continue
        banked = cand.get("mfe_pct", 0.0) >= tp
        if banked:
            gross = tp / one_r
        elif cand.get("tp_stopped"):
            gross = cand.get("tp_stop_r", -1.0)  # gap-aware stop fill, ≤ −1R
        else:
            gross = cand.get("tp_close_pct", 0.0) / one_r
        scored.append((gross, gross - _cost_r(cand), banked))
    n = len(scored)
    banked_n = sum(1 for _, _, banked in scored if banked)
    gross_sum = sum(gross for gross, _, _ in scored)
    net_sum = sum(net for _, net, _ in scored)
    has_rows = n > 0
    return {
        "tp_pct": round(tp * 100, 1),
        "total": n,
        "wins": banked_n,
        "hit_rate": round(banked_n / n * 100, 1) if has_rows else None,
        "avg_r": round(gross_sum / n, 3) if has_rows else None,
        "total_r": round(gross_sum, 2) if has_rows else None,
        "net_avg_r": round(net_sum / n, 3) if has_rows else None,
        "net_total_r": round(net_sum, 2) if has_rows else None,
    }
 def _trailing_bucket(cands: list[dict], trail_pct: int) -> dict:
    """Aggregate the trailing-stop exit of width ``trail_pct`` (integer percent).
    Reads each candidate's realized R for that width from its ``trail_r`` dict;
    candidates with no value for the width are ignored. Net figures subtract
    ``_cost_r``. A "win" is any exit with gross R above zero."""
    gross: list[float] = []
    net: list[float] = []
    for cand in cands:
        realized = cand.get("trail_r", {}).get(trail_pct)
        if realized is None:
            continue
        gross.append(realized)
        net.append(realized - _cost_r(cand))
    n = len(gross)
    in_profit = sum(1 for r in gross if r > 0)
    if not n:
        # No candidate carried this width: counts are zero, every ratio is undefined.
        return {
            "trail_pct": trail_pct,
            "total": n,
            "wins": in_profit,
            "win_rate": None,
            "avg_r": None,
            "total_r": None,
            "net_avg_r": None,
            "net_total_r": None,
        }
    gross_sum = sum(gross)
    net_sum = sum(net)
    return {
        "trail_pct": trail_pct,
        "total": n,
        "wins": in_profit,
        "win_rate": round(in_profit / n * 100, 1),
        "avg_r": round(gross_sum / n, 3),
        "total_r": round(gross_sum, 2),
        "net_avg_r": round(net_sum / n, 3),
        "net_total_r": round(net_sum, 2),
    }
 def _time_exit_bucket(cands: list[dict], hold_days: int) -> dict:
    """Stats for the hold-``hold_days`` exit: initial stop active, otherwise out
    at the day-N close. Each candidate carries its realized R per hold length in
@@ -605,23 +469,6 @@ def _time_exit_bucket(cands: list[dict], hold_days: int) -> dict:
    }
 def _calibration(cands: list[dict]) -> list[dict]:
    """Compare predicted target probability with the realized hit rate.
    Candidates are grouped by ``primary_prob`` into the half-open ranges listed
    in ``_CAL_BUCKETS``; empty ranges produce no row. Each row reports the
    bucket label, its size, the mean predicted probability and the percentage
    of members whose ``target_hit`` is truthy."""
    table: list[dict] = []
    for lower, upper in _CAL_BUCKETS:
        members = [cand for cand in cands if lower <= cand["primary_prob"] < upper]
        if members:
            size = len(members)
            hit_count = sum(1 for member in members if member["target_hit"])
            prob_sum = sum(member["primary_prob"] for member in members)
            table.append({
                "bucket": f"{int(lower)}-{int(min(upper, 100))}%",
                "n": size,
                "predicted_avg": round(prob_sum / size, 1),
                "realized_hit_rate": round(hit_count / size * 100, 1),
            })
    return table
 # ---------------------------------------------------------------------------
 # Cross-sectional signal evaluation (factor information-coefficient)
 # ---------------------------------------------------------------------------
@@ -1172,6 +1019,31 @@ def _simulate_portfolio(
        if var > 0:
            sharpe = mean / math.sqrt(var) * math.sqrt(252)
    # Per-calendar-year returns off the equity curve — shows whether every year
    # contributed or one exceptional stretch carried the result.
    yearly: list[dict] = []
    year_start_eq = curve[0][1]
    cur_year = date.fromordinal(curve[0][0]).year
    last_eq = curve[0][1]
    for o, eq in curve:
        y = date.fromordinal(o).year
        if y != cur_year:
            yearly.append({
                "year": cur_year,
                "return_pct": (
                    round((last_eq / year_start_eq - 1) * 100, 1) if year_start_eq > 0 else None
                ),
            })
            cur_year = y
            year_start_eq = last_eq
        last_eq = eq
    yearly.append({
        "year": cur_year,
        "return_pct": (
            round((last_eq / year_start_eq - 1) * 100, 1) if year_start_eq > 0 else None
        ),
    })
    pnls = [t["pnl"] for t in trades]
    wins = sum(1 for p in pnls if p > 0)
    spy_pct = None
@@ -1201,11 +1073,163 @@ def _simulate_portfolio(
        ),
        "skipped_book_full": skipped_full,
        "spy_return_pct": round(spy_pct, 1) if spy_pct is not None else None,
        "yearly_returns": yearly,
        "start_date": date.fromordinal(calendar[0]).isoformat(),
        "end_date": date.fromordinal(calendar[-1]).isoformat(),
    }
 # ---------------------------------------------------------------------------
 # Data-driven recommendation
 # ---------------------------------------------------------------------------
 # A floor whose removal costs less than this (R net per trade, under the hold
 # exit) is judged not to be pulling its weight.
 _FLOOR_KEEP_THRESHOLD = 0.02
 # The hold exit must beat the target exit by at least this much to be advised.
 _EXIT_SWITCH_THRESHOLD = 0.05
 def _build_recommendation(report: dict) -> dict:
    """Strategy advice derived from THIS report's numbers — recomputed every
    run, so if the data flips, the advice flips. Rules are deliberately simple
    and transparent; thresholds are module constants above."""
    items: list[dict] = []
    q = report.get("overall_qualified") or {}
    target_net = q.get("net_avg_r")
    # Exit policy: the production target/stop race vs the best fixed hold.
    time_rows = [r for r in report.get("time_exit_sweep") or [] if r.get("net_avg_r") is not None]
    best_hold = max(time_rows, key=lambda r: r["net_avg_r"], default=None)
    sim_rows = {
        p.get("policy"): p
        for p in (report.get("portfolio_sim") or {}).get("policies", [])
    }
    hold_sim = sim_rows.get("hold")
    if best_hold is not None and target_net is not None:
        if best_hold["net_avg_r"] > target_net + _EXIT_SWITCH_THRESHOLD:
            text = (
                f"Exit: hold {best_hold['hold_days']} trading days with the initial stop "
                f"({best_hold['net_avg_r']:+.2f}R net/trade vs {target_net:+.2f}R for the S/R target exit)."
            )
            target_sim = sim_rows.get("target")
            if (
                hold_sim is not None and target_sim is not None
                and hold_sim.get("cagr_pct") is not None and target_sim.get("cagr_pct") is not None
            ):
                text += (
                    f" The simulated book agrees: {hold_sim['cagr_pct']:+.1f}% vs "
                    f"{target_sim['cagr_pct']:+.1f}% CAGR at similar drawdown."
                )
            items.append({"topic": "exit", "text": text})
        else:
            items.append({
                "topic": "exit",
                "text": (
                    f"Exit: keep the S/R target exit ({target_net:+.2f}R net/trade) — "
                    "no fixed hold beats it by a meaningful margin."
                ),
            })
    # Gate floors, judged under the hold exit (the ablation's Hold column).
    ablation = {r["variant"]: r for r in report.get("gate_ablation") or []}
    base_row = ablation.get("all_floors")
    base_hold = (base_row or {}).get("hold_net_avg_r")
    floor_labels = {
        "no_confidence_floor": "confidence floor",
        "no_rr_floor": "R:R floor",
        "no_neutral_exclusion": "NEUTRAL exclusion",
    }
    if base_hold is not None:
        for variant, label in floor_labels.items():
            row = ablation.get(variant)
            if row is None or row.get("hold_net_avg_r") is None:
                continue
            delta = base_hold - row["hold_net_avg_r"]
            extra = row["total"] - base_row["total"]
            if delta <= _FLOOR_KEEP_THRESHOLD:
                items.append({
                    "topic": "gate",
                    "text": (
                        f"Gate: the {label} adds nothing — dropping it costs {delta:+.2f}R/trade "
                        f"and adds {extra} trades."
                    ),
                })
            else:
                items.append({
                    "topic": "gate",
                    "text": f"Gate: keep the {label} (worth {delta:+.2f}R/trade under the hold exit).",
                })
    # Momentum cutoff: best per-trade net among the active-gate sweep rows.
    sweep_rows = [
        r for r in report.get("sweep") or []
        if r.get("net_avg_r") is not None and (r.get("min_momentum_percentile") or 0) > 0
    ]
    if sweep_rows:
        best_cut = max(sweep_rows, key=lambda r: r["net_avg_r"])
        items.append({
            "topic": "cutoff",
            "text": (
                f"Momentum cutoff: {best_cut['min_momentum_percentile']:.0f} has the best "
                f"per-trade net ({best_cut['net_avg_r']:+.2f}R over {best_cut['total']} setups)."
            ),
        })
    # Book vs benchmark.
    book = hold_sim or sim_rows.get("target")
    if book is not None and book.get("spy_return_pct") is not None:
        edge = book["total_return_pct"] - book["spy_return_pct"]
        verdict = "beats" if edge > 0 else "LAGS"
        items.append({
            "topic": "benchmark",
            "text": (
                f"Book vs SPY: {verdict} buy-and-hold by {edge:+.1f} points "
                f"({book['total_return_pct']:+.1f}% vs {book['spy_return_pct']:+.1f}%), "
                f"max drawdown −{book['max_drawdown_pct']:.1f}%."
            ),
        })
    # Robustness: does the edge survive without the biggest winners?
    trimmed = q.get("net_avg_r_ex_top5")
    if trimmed is not None:
        if trimmed > 0:
            items.append({
                "topic": "robustness",
                "text": (
                    f"Robustness: expectancy survives removing the top 5% of winners "
                    f"({trimmed:+.2f}R net/trade) — the edge is not a handful of outliers."
                ),
            })
        else:
            items.append({
                "topic": "robustness",
                "text": (
                    f"Robustness WARNING: without the top 5% of winners the edge disappears "
                    f"({trimmed:+.2f}R net/trade) — outlier-dependent, treat the headline "
                    "expectancy with caution."
                ),
            })
    headline = None
    if best_hold is not None and target_net is not None and best_hold["net_avg_r"] > target_net + _EXIT_SWITCH_THRESHOLD:
        cagr_note = (
            f" (~{hold_sim['cagr_pct']:.0f}% CAGR simulated)"
            if hold_sim is not None and hold_sim.get("cagr_pct") is not None
            else ""
        )
        headline = (
            f"Trade the qualified list long-only; hold {best_hold['hold_days']} trading days "
            f"with the initial ATR stop{cagr_note}."
        )
    return {
        "headline": headline,
        "items": items,
        "note": "Derived from this report's numbers on every run — the advice flips if the data does.",
    }
 async def run_backtest(
    db: AsyncSession,
    progress_cb: Callable[[int, int, str], None] | None = None,
@@ -1346,7 +1370,7 @@ async def run_backtest(
    except Exception:
        logger.exception("Portfolio simulation failed")
-    return {
+    report = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "tickers": total,
        "candidates": len(candidates),
@@ -1376,8 +1400,6 @@ async def run_backtest(
            "instead of the S/R target — the view that matters if the exit "
            "policy moves to a fixed hold."
        ),
        "take_profit_sweep": [_take_profit_bucket(qualified, tp) for tp in TP_LEVELS],
        "trailing_sweep": [_trailing_bucket(qualified, round(f * 100)) for f in TRAIL_LEVELS],
        "time_exit_sweep": [_time_exit_bucket(qualified, n) for n in TIME_EXIT_DAYS],
        "portfolio_sim": {
            "params": {
@@ -1401,7 +1423,6 @@ async def run_backtest(
                "same window. In-sample; no dividends."
            ),
        },
        "calibration": _calibration(candidates),
        "signal_eval": _signal_evaluation(collected),
        "signal_eval_note": (
            "Cross-sectional rank-IC of price-only signals vs the forward "
@@ -1421,6 +1442,8 @@ async def run_backtest(
            "~6 months ≈ one market regime — treat as directional, not gospel."
        ),
    }
    report["recommendation"] = _build_recommendation(report)
    return report
 async def run_and_store(
@@ -124,14 +124,6 @@ export function BacktestPanel() {
  const queryClient = useQueryClient();
  const toast = useToast();
  const bestTpAvgR =
    report?.take_profit_sweep && report.take_profit_sweep.length > 0
      ? Math.max(...report.take_profit_sweep.map((r) => netOrGross(r) ?? -Infinity))
      : null;
  const bestTrailAvgR =
    report?.trailing_sweep && report.trailing_sweep.length > 0
      ? Math.max(...report.trailing_sweep.map((r) => netOrGross(r) ?? -Infinity))
      : null;
  const bestTimeAvgR =
    report?.time_exit_sweep && report.time_exit_sweep.length > 0
      ? Math.max(...report.time_exit_sweep.map((r) => netOrGross(r) ?? -Infinity))
@@ -189,6 +181,30 @@ export function BacktestPanel() {
              )}
            </p>
            {report.recommendation && report.recommendation.items.length > 0 && (
              <div className="glass border border-blue-400/20 p-4">
                <p className="section-index">What this backtest recommends</p>
                {report.recommendation.headline && (
                  <p className="mt-1.5 text-sm font-semibold text-gray-100">
                    {report.recommendation.headline}
                  </p>
                )}
                <ul className="mt-2 space-y-1">
                  {report.recommendation.items.map((item) => (
                    <li
                      key={item.topic + item.text}
                      className={`text-xs ${item.text.includes('WARNING') || item.text.includes('LAGS') ? 'text-amber-400' : 'text-gray-400'}`}
                    >
                      {item.text}
                    </li>
                  ))}
                </ul>
                {report.recommendation.note && (
                  <p className="mt-2 text-[11px] text-gray-600">{report.recommendation.note}</p>
                )}
              </div>
            )}
            <div className="grid gap-3 sm:grid-cols-2 lg:grid-cols-4">
              <Stat
                label="Qualified Hit Rate"
@@ -213,6 +229,30 @@ export function BacktestPanel() {
                valueClass={rColor(report.overall_qualified.total_r)}
                sub="cumulative, risk-adjusted"
              />
              {report.overall_qualified.median_net_r != null && (
                <Stat
                  label="Median Net R"
                  value={fmtR(report.overall_qualified.median_net_r)}
                  valueClass={rColor(report.overall_qualified.median_net_r)}
                  sub="qualified · the typical trade"
                />
              )}
              {report.overall_qualified.profit_factor != null && (
                <Stat
                  label="Profit Factor"
                  value={report.overall_qualified.profit_factor.toFixed(2)}
                  valueClass={report.overall_qualified.profit_factor > 1 ? 'text-emerald-400' : 'text-red-400'}
                  sub="qualified · net wins / net losses"
                />
              )}
              {report.overall_qualified.net_avg_r_ex_top5 != null && (
                <Stat
                  label="Ex-Top-5% Net R"
                  value={fmtR(report.overall_qualified.net_avg_r_ex_top5)}
                  valueClass={rColor(report.overall_qualified.net_avg_r_ex_top5)}
                  sub="expectancy without the biggest winners"
                />
              )}
            </div>
            <div className="glass overflow-x-auto">
@@ -348,106 +388,6 @@ export function BacktestPanel() {
              </div>
            )}
            {report.take_profit_sweep && report.take_profit_sweep.length > 0 && (
              <div>
                <p className="mb-2 text-xs font-medium uppercase tracking-widest text-gray-500">
                  Take-profit exit (alternative to the target above)
                </p>
                <p className="mb-2 text-[11px] text-gray-500">
                  Models a realistic exit instead of waiting for the far S/R target: bank{' '}
                  <span className="text-gray-300">+X%</span> if price reaches it before the stop, else the
                  stop-fill loss (a gap through the stop fills at the open, so it can exceed −1R), else exit
                  at the {report.params.horizon_days}-day close. In R, so it compares to the
                  target model above. <span className="text-gray-300">Hit Rate = how often you'd have banked
                  +X%</span> (how far winners actually run) — no top-ticking, it's the level you'd really set.
                  The setup's own S/R target is <em>not</em> used here (exiting at that target is the model
                  above); this is a pure fixed-% exit. ★ = best net avg R.
                </p>
                <div className="glass overflow-x-auto">
                  <table className="w-full text-sm">
                    <thead>
                      <tr className="border-b border-white/[0.06] text-left text-xs uppercase tracking-wider text-gray-500">
                        <th className="px-4 py-2.5">Take-profit</th>
                        <th className="px-4 py-2.5 text-right">Setups</th>
                        <th className="px-4 py-2.5 text-right">Hit (banked)</th>
                        <th className="px-4 py-2.5 text-right">Hit Rate</th>
                        <th className="px-4 py-2.5 text-right">Avg R</th>
                        <th className="px-4 py-2.5 text-right">Net Avg R</th>
                        <th className="px-4 py-2.5 text-right">Total R</th>
                      </tr>
                    </thead>
                    <tbody>
                      {report.take_profit_sweep.map((row) => {
                        const best = netOrGross(row) != null && netOrGross(row) === bestTpAvgR;
                        return (
                          <tr key={row.tp_pct} className={`border-b border-white/[0.04] ${best ? 'bg-emerald-400/[0.06]' : ''}`}>
                            <td className="num px-4 py-2.5 text-gray-200">
                              {best && <span className="mr-1 text-emerald-300">★</span>}
                              +{row.tp_pct}%
                            </td>
                            <td className="num px-4 py-2.5 text-right text-gray-200">{row.total}</td>
                            <td className="num px-4 py-2.5 text-right text-emerald-400">{row.wins}</td>
                            <td className="num px-4 py-2.5 text-right text-gray-200">{fmtPct(row.hit_rate)}</td>
                            <td className={`num px-4 py-2.5 text-right ${rColor(row.avg_r)}`}>{fmtR(row.avg_r)}</td>
                            <td className={`num px-4 py-2.5 text-right font-semibold ${rColor(row.net_avg_r ?? null)}`}>{fmtR(row.net_avg_r ?? null)}</td>
                            <td className={`num px-4 py-2.5 text-right ${rColor(row.total_r)}`}>{fmtR(row.total_r)}</td>
                          </tr>
                        );
                      })}
                    </tbody>
                  </table>
                </div>
              </div>
            )}
            {report.trailing_sweep && report.trailing_sweep.length > 0 && (
              <div>
                <p className="mb-2 text-xs font-medium uppercase tracking-widest text-gray-500">
                  Trailing-stop exit
                </p>
                <p className="mb-2 text-[11px] text-gray-500">
                  Let it run, but exit when price gives back <span className="text-gray-300">X% from its
                  peak</span> (the stop only ratchets up, never below the initial stop). Captures the tail
                  without the fixed take-profit's all-or-nothing miss, and protects gains. In R vs the initial
                  risk. <span className="text-gray-300">Win Rate = share closed in profit.</span> ★ = best net avg R.
                </p>
                <div className="glass overflow-x-auto">
                  <table className="w-full text-sm">
                    <thead>
                      <tr className="border-b border-white/[0.06] text-left text-xs uppercase tracking-wider text-gray-500">
                        <th className="px-4 py-2.5">Trail</th>
                        <th className="px-4 py-2.5 text-right">Setups</th>
                        <th className="px-4 py-2.5 text-right">Profitable</th>
                        <th className="px-4 py-2.5 text-right">Win Rate</th>
                        <th className="px-4 py-2.5 text-right">Avg R</th>
                        <th className="px-4 py-2.5 text-right">Net Avg R</th>
                        <th className="px-4 py-2.5 text-right">Total R</th>
                      </tr>
                    </thead>
                    <tbody>
                      {report.trailing_sweep.map((row) => {
                        const best = netOrGross(row) != null && netOrGross(row) === bestTrailAvgR;
                        return (
                          <tr key={row.trail_pct} className={`border-b border-white/[0.04] ${best ? 'bg-emerald-400/[0.06]' : ''}`}>
                            <td className="num px-4 py-2.5 text-gray-200">
                              {best && <span className="mr-1 text-emerald-300">★</span>}
                              {row.trail_pct}%
                            </td>
                            <td className="num px-4 py-2.5 text-right text-gray-200">{row.total}</td>
                            <td className="num px-4 py-2.5 text-right text-emerald-400">{row.wins}</td>
                            <td className="num px-4 py-2.5 text-right text-gray-200">{fmtPct(row.win_rate)}</td>
                            <td className={`num px-4 py-2.5 text-right ${rColor(row.avg_r)}`}>{fmtR(row.avg_r)}</td>
                            <td className={`num px-4 py-2.5 text-right font-semibold ${rColor(row.net_avg_r ?? null)}`}>{fmtR(row.net_avg_r ?? null)}</td>
                            <td className={`num px-4 py-2.5 text-right ${rColor(row.total_r)}`}>{fmtR(row.total_r)}</td>
                          </tr>
                        );
                      })}
                    </tbody>
                  </table>
                </div>
              </div>
            )}
            {report.time_exit_sweep && report.time_exit_sweep.length > 0 && (
              <div>
                <p className="mb-2 text-xs font-medium uppercase tracking-widest text-gray-500">
@@ -543,6 +483,16 @@ export function BacktestPanel() {
                          ['Avg P&L / trade', (p) => fmtMoney(p.avg_trade_pnl), (p) => rColor(p.avg_trade_pnl)],
                          ['Best / worst trade', (p) => `${fmtR(p.best_trade_r)} / ${fmtR(p.worst_trade_r)}`, () => 'text-gray-300'],
                          ['Avg holding time', (p) => fmtDays(p.avg_hold_days), () => 'text-gray-300'],
                          [
                            'Per-year returns',
                            (p) =>
                              p.yearly_returns && p.yearly_returns.length > 0
                                ? p.yearly_returns
                                    .map((y) => `${y.year} ${fmtSignedPct(y.return_pct)}`)
                                    .join(' · ')
                                : '—',
                            () => 'text-gray-300',
                          ],
                          ['Entries skipped (book full)', (p) => String(p.skipped_book_full), () => 'text-gray-500'],
                        ] as [string, (p: BacktestPortfolioPolicy) => string, (p: BacktestPortfolioPolicy) => string][]
                      ).map(([label, fmt, color]) => (
@@ -561,47 +511,6 @@ export function BacktestPanel() {
              </div>
            )}
            <div>
              <p className="mb-2 text-xs font-medium uppercase tracking-widest text-gray-500">
                Probability calibration
              </p>
              <p className="mb-2 text-[11px] text-gray-500">
                Do targets we call “X% likely” actually hit that often? Realized below predicted =
                the model is over-confident.
              </p>
              {report.calibration.length === 0 ? (
                <Callout variant="empty">Not enough resolved setups to calibrate.</Callout>
              ) : (
                <div className="glass overflow-x-auto">
                  <table className="w-full text-sm">
                    <thead>
                      <tr className="border-b border-white/[0.06] text-left text-xs uppercase tracking-wider text-gray-500">
                        <th className="px-4 py-2.5">Predicted Bucket</th>
                        <th className="px-4 py-2.5 text-right">Setups</th>
                        <th className="px-4 py-2.5 text-right">Avg Predicted</th>
                        <th className="px-4 py-2.5 text-right">Realized Hit Rate</th>
                      </tr>
                    </thead>
                    <tbody>
                      {report.calibration.map((row) => {
                        const over = row.realized_hit_rate < row.predicted_avg;
                        return (
                          <tr key={row.bucket} className="border-b border-white/[0.04]">
                            <td className="px-4 py-2.5 text-gray-200">{row.bucket}</td>
                            <td className="num px-4 py-2.5 text-right text-gray-300">{row.n}</td>
                            <td className="num px-4 py-2.5 text-right text-gray-400">{row.predicted_avg.toFixed(0)}%</td>
                            <td className={`num px-4 py-2.5 text-right font-semibold ${over ? 'text-amber-400' : 'text-emerald-400'}`}>
                              {row.realized_hit_rate.toFixed(0)}%
                            </td>
                          </tr>
                        );
                      })}
                    </tbody>
                  </table>
                </div>
              )}
            </div>
            {report.signal_eval && report.signal_eval.length > 0 && (
              <div>
                <p className="mb-2 text-xs font-medium uppercase tracking-widest text-gray-500">
@@ -236,41 +236,16 @@ export interface BacktestBucket {
  worst_r?: number | null;
  avg_hold_days?: number | null;
  net_r_per_day?: number | null;
-}
+  // Robustness: distribution shape, and expectancy without the top winners.
-
+  median_net_r?: number | null;
-export interface BacktestCalibrationRow {
+  profit_factor?: number | null;
-  bucket: string;
+  net_avg_r_ex_top5?: number | null;
  n: number;
  predicted_avg: number;
  realized_hit_rate: number;
 }
 // One row of the momentum-percentile sweep (`BacktestReport.sweep`): the
 // usual bucket statistics, computed for a single minimum-percentile cutoff.
 export interface BacktestSweepRow extends BacktestBucket {
  // The cutoff this row was computed for (the backend tests use values such
  // as 80.0, 60.0 and 0.0; lower cutoffs admit at least as many setups).
  min_momentum_percentile: number;
 }
 // One row of the fixed take-profit sweep (`BacktestReport.take_profit_sweep`).
 // Each row summarises every setup exited with the same "+X%" take-profit.
 export interface BacktestTakeProfitRow {
  // Take-profit level; the panel renders it as `+{tp_pct}%`.
  tp_pct: number;
  // Number of setups counted in the row ("Setups" column).
  total: number;
  // Setups that banked the take-profit ("Hit (banked)" column).
  wins: number;
  // Share of setups that banked the take-profit ("Hit Rate" column); null when
  // unavailable. NOTE(review): looks like percentage points (the backend
  // bucket test expects ~33.3 for 1 hit out of 3) - confirm.
  hit_rate: number | null;
  // Mean gross result per setup, in R ("Avg R" column); null when unavailable.
  avg_r: number | null;
  // Sum of gross results, in R ("Total R" column); null when unavailable.
  total_r: number | null;
  // Mean result after the round-trip cost, in R ("Net Avg R" column). Optional:
  // the panel substitutes null when the field is missing.
  net_avg_r?: number | null;
  // Sum of results after the round-trip cost, in R.
  net_total_r?: number | null;
 }
 // One row of the trailing-stop sweep (`BacktestReport.trailing_sweep`).
 // Each row summarises every setup exited with the same trail width.
 export interface BacktestTrailingRow {
  // Trail width; the panel renders it as `{trail_pct}%` (give-back from peak).
  trail_pct: number;
  // Number of setups counted in the row ("Setups" column).
  total: number;
  // Setups that closed in profit ("Profitable" column).
  wins: number;
  // Share of setups closed in profit ("Win Rate" column); null when
  // unavailable. NOTE(review): looks like percentage points (the backend
  // bucket test expects ~66.7 for 2 of 3) - confirm.
  win_rate: number | null;
  // Mean gross result per setup, in R ("Avg R" column); null when unavailable.
  avg_r: number | null;
  // Sum of gross results, in R ("Total R" column); null when unavailable.
  total_r: number | null;
  // Mean result after the round-trip cost, in R ("Net Avg R" column). Optional:
  // the panel substitutes null when the field is missing.
  net_avg_r?: number | null;
  // Sum of results after the round-trip cost, in R.
  net_total_r?: number | null;
 }
 export interface BacktestTimeExitRow {
  hold_days: number;
  total: number;
@@ -304,10 +279,17 @@ export interface BacktestPortfolioPolicy {
  avg_hold_days: number | null;
  skipped_book_full: number;
  spy_return_pct: number | null;
  yearly_returns?: { year: number; return_pct: number | null }[];
  start_date: string;
  end_date: string;
 }
 // Advice derived from the report's own numbers
 // (`BacktestReport.recommendation`).
 export interface BacktestRecommendation {
  // Short summary line; null when no headline is available.
  headline: string | null;
  // Individual pieces of advice. `topic` groups them (the backend tests use
  // "exit", "gate", "cutoff", "benchmark" and "robustness"); `text` is what
  // the panel displays, highlighted when it contains 'WARNING' or 'LAGS'.
  items: { topic: string; text: string }[];
  // Optional footnote, rendered under the list only when present.
  note?: string;
 }
 export interface BacktestPortfolioSim {
  params: {
    starting_capital: number;
@@ -359,11 +341,9 @@ export interface BacktestReport {
  sweep: BacktestSweepRow[];
  gate_ablation?: BacktestGateAblationRow[];
  gate_ablation_note?: string;
  take_profit_sweep?: BacktestTakeProfitRow[];
  trailing_sweep?: BacktestTrailingRow[];
  time_exit_sweep?: BacktestTimeExitRow[];
  portfolio_sim?: BacktestPortfolioSim;
-  calibration: BacktestCalibrationRow[];
+  recommendation?: BacktestRecommendation;
  signal_eval?: BacktestSignalEvalRow[];
  signal_eval_note?: string;
  note: string;
@@ -75,116 +75,21 @@ class TestStopFillR:
        assert bt._stop_fill_r("short", 100.0, 105.0, _bar(110, 104, 108, open_=107)) == pytest.approx(-1.4)
-class TestTakeProfitPrimitives:
+class TestRiskAndStopDay:
-    def test_long_tp_reachable_before_stop(self):
+    def test_no_stop(self):
-        risk, stopped, mfe, close_pct, stop_day, _ = bt._tp_primitives("long", 100.0, 95.0, [_bar(109, 101, 108)], 30)
+        risk, stop_day = bt._risk_and_stop_day("long", 100.0, 95.0, [_bar(109, 101, 108)], 30)
        assert risk == pytest.approx(0.05)
        assert stopped is False
        assert mfe == pytest.approx(0.09)
        assert close_pct == pytest.approx(0.08)
        assert stop_day is None
-    def test_long_stop_zeroes_mfe(self):
+    def test_stop_day_is_one_based(self):
-        # Low pierces the stop on the only bar → loss, nothing banked before it.
+        bars = [_bar(102, 99, 101), _bar(101, 94, 96)]
-        risk, stopped, mfe, close_pct, stop_day, stop_r = bt._tp_primitives("long", 100.0, 95.0, [_bar(101, 94, 96)], 30)
+        risk, stop_day = bt._risk_and_stop_day("long", 100.0, 95.0, bars, 30)
-        assert stopped is True
+        assert risk == pytest.approx(0.05)
-        assert mfe == pytest.approx(0.0)
+        assert stop_day == 2
        assert close_pct == pytest.approx(-0.04)
        assert stop_day == 1
        assert stop_r == pytest.approx(-1.0)
    def test_gap_through_stop_loses_more_than_1r(self):
        _, stopped, _, _, stop_day, stop_r = bt._tp_primitives(
            "long", 100.0, 95.0, [_bar(93, 90, 91, open_=92)], 30
        )
        assert stopped is True
        assert stop_day == 1
        assert stop_r == pytest.approx(-1.6)  # filled at the 92 open, not the 95 stop
    def test_long_drift_no_trigger(self):
        bars = [_bar(102, 99, 101), _bar(103, 100, 102)]
        risk, stopped, mfe, close_pct, _, _ = bt._tp_primitives("long", 100.0, 95.0, bars, 30)
        assert stopped is False
        assert mfe == pytest.approx(0.03)
        assert close_pct == pytest.approx(0.02)
    def test_short_direction(self):
-        # short entry 100, stop 105; price falls → favourable = (entry - low)/entry
+        _, stop_day = bt._risk_and_stop_day("short", 100.0, 105.0, [_bar(106, 101, 104)], 30)
-        risk, stopped, mfe, close_pct, _, _ = bt._tp_primitives("short", 100.0, 105.0, [_bar(101, 92, 93)], 30)
+        assert stop_day == 1
        assert risk == pytest.approx(0.05)
        assert stopped is False
        assert mfe == pytest.approx(0.08)
        assert close_pct == pytest.approx(0.07)
 class TestTakeProfitBucket:
    """Checks how ``bt._take_profit_bucket`` aggregates fixed take-profit outcomes."""

    def test_bucket_mix(self):
        """A banked win, a stop-out and a timeout roll up into one bucket."""
        banked_win = {"risk_pct": 0.05, "mfe_pct": 0.09, "tp_stopped": False, "tp_close_pct": 0.08}  # +1.6R win
        stopped_out = {"risk_pct": 0.05, "mfe_pct": 0.02, "tp_stopped": True, "tp_close_pct": -0.04}  # -1R stop
        timed_out = {"risk_pct": 0.05, "mfe_pct": 0.03, "tp_stopped": False, "tp_close_pct": 0.01}  # +0.2R timeout

        bucket = bt._take_profit_bucket([banked_win, stopped_out, timed_out], 0.08)

        assert bucket["total"] == 3
        assert bucket["wins"] == 1
        assert bucket["hit_rate"] == pytest.approx(33.3, abs=0.1)
        assert bucket["total_r"] == pytest.approx(0.8, abs=0.01)
        assert bucket["avg_r"] == pytest.approx(0.267, abs=0.01)
        # Net figures subtract a 0.04R round trip per candidate (risk_pct 0.05).
        assert bucket["net_total_r"] == pytest.approx(0.8 - 3 * _COST_R_005, abs=0.01)
        assert bucket["net_avg_r"] == pytest.approx((0.8 - 3 * _COST_R_005) / 3, abs=0.01)

    def test_zero_risk_skipped(self):
        """A candidate whose risk_pct is zero is left out of the bucket."""
        zero_risk = {"risk_pct": 0.0, "mfe_pct": 0.2, "tp_stopped": False, "tp_close_pct": 0.1}

        bucket = bt._take_profit_bucket([zero_risk], 0.08)

        assert bucket["total"] == 0
        assert bucket["avg_r"] is None
 class TestTrailingExits:
    """Checks the per-width trailing-stop results from ``bt._trailing_exits``.

    Results are keyed by the trail width in whole percent and expressed in R
    against the initial risk (entry 100, stop 90 → 10% risk in every case).
    """

    def test_locks_gain_on_pullback(self):
        # Price runs to 120; the 10% trail (peak 120 → 108) is pierced on the drop.
        run_up = _bar(120, 110, 118)
        pullback = _bar(130, 100, 105)
        exits = bt._trailing_exits("long", 100.0, 90.0, (0.10,), [run_up, pullback], 30)
        assert exits[10] == pytest.approx(0.8)  # (108-100)/100 / 0.10 risk

    def test_initial_stop_caps_loss(self):
        # A 20% trail is looser than the initial stop, so the initial stop governs: -1R.
        only_bar = _bar(101, 89, 90)
        exits = bt._trailing_exits("long", 100.0, 90.0, (0.20,), [only_bar], 30)
        assert exits[20] == pytest.approx(-1.0)

    def test_timeout_exits_at_close(self):
        first = _bar(105, 98, 104)
        second = _bar(106, 100, 105)
        exits = bt._trailing_exits("long", 100.0, 90.0, (0.20,), [first, second], 30)
        assert exits[20] == pytest.approx(0.5)  # close 105 → +5% / 10% risk

    def test_multiple_widths_one_pass(self):
        # The tighter trail locks in more here (exit at 114 vs 108).
        run_up = _bar(120, 110, 118)
        pullback = _bar(130, 100, 105)
        exits = bt._trailing_exits("long", 100.0, 90.0, (0.10, 0.05), [run_up, pullback], 30)
        assert exits[10] == pytest.approx(0.8)
        assert exits[5] == pytest.approx(1.4)

    def test_gap_through_stop_fills_at_open(self):
        # The initial stop at 90 governs (a 20% trail from peak 100 sits lower);
        # the bar opens at 85, below the stop, so the fill is at the open.
        gap_bar = _bar(88, 84, 86, open_=85)
        exits = bt._trailing_exits("long", 100.0, 90.0, (0.20,), [gap_bar], 30)
        assert exits[20] == pytest.approx(-1.5)
 class TestTrailingBucket:
    """Checks how ``bt._trailing_bucket`` aggregates one trail width."""

    def test_bucket(self):
        """Two profitable exits and one stop-out at the 5% width."""
        per_width_results = (
            {5: 1.4, 10: 0.8},
            {5: -1.0, 10: -1.0},
            {5: 0.5, 10: 0.5},
        )
        candidates = [{"trail_r": result, "risk_pct": 0.10} for result in per_width_results]

        bucket = bt._trailing_bucket(candidates, 5)

        assert bucket["total"] == 3
        assert bucket["wins"] == 2
        assert bucket["win_rate"] == pytest.approx(66.7, abs=0.1)
        assert bucket["total_r"] == pytest.approx(0.9, abs=0.01)
        assert bucket["avg_r"] == pytest.approx(0.3, abs=0.01)
        # Net figures subtract a 0.02R round trip per candidate (risk_pct 0.10).
        assert bucket["net_total_r"] == pytest.approx(0.9 - 3 * 0.02, abs=0.01)
        assert bucket["net_avg_r"] == pytest.approx(0.28, abs=0.01)
 class TestTimeExits:
@@ -357,6 +262,9 @@ class TestSimulatePortfolio:
        assert sim["max_drawdown_pct"] == 0.0
        assert sim["cagr_pct"] is None  # window far too short to annualize
        assert sim["spy_return_pct"] is None
        assert sim["yearly_returns"] == [
            {"year": 2025, "return_pct": pytest.approx(1.2, abs=0.05)}
        ]
    def test_target_policy_exits_at_target(self):
        closes = [100.0, 102.0, 104.0, 106.0, 108.0, 110.0]
@@ -405,6 +313,11 @@ def test_bucket_stats_counts_and_expectancy():
    assert s["worst_r"] == -1.0
    assert s["avg_hold_days"] == 10.0
    assert s["net_r_per_day"] == pytest.approx((1.0 - _COST_R_005) / 10.0, abs=0.001)
    # robustness: net rs are [2.96, 1.96, -1.04, -0.04]
    assert s["median_net_r"] == pytest.approx(0.96, abs=0.001)
    assert s["profit_factor"] == pytest.approx(4.92 / 1.08, abs=0.01)
    # ex-top-5%: ceil(4 * 0.05) = 1 winner trimmed → mean of the remaining three
    assert s["net_avg_r_ex_top5"] == pytest.approx((1.96 - 1.04 - 0.04) / 3, abs=0.001)
 def test_bucket_stats_empty():
@@ -423,18 +336,53 @@ def test_bucket_stats_no_risk_pct_means_no_cost():
    assert s["net_total_r"] == s["total_r"]
-def test_calibration_buckets():
+def test_build_recommendation_reads_the_report():
-    cands = [
+    report = {
-        _cand(65, OUTCOME_TARGET_HIT, 2.0),
+        "overall_qualified": {"net_avg_r": 0.13, "net_avg_r_ex_top5": 0.05},
-        _cand(62, OUTCOME_STOP_HIT, 2.0),
+        "time_exit_sweep": [
-        _cand(15, OUTCOME_STOP_HIT, 2.0),
+            {"hold_days": 21, "net_avg_r": 0.38},
-    ]
+            {"hold_days": 30, "net_avg_r": 0.50},
-    rows = bt._calibration(cands)
+        ],
-    by_bucket = {r["bucket"]: r for r in rows}
+        "gate_ablation": [
-    assert by_bucket["60-80%"]["n"] == 2
+            {"variant": "all_floors", "total": 100, "hold_net_avg_r": 0.50},
-    assert by_bucket["60-80%"]["realized_hit_rate"] == 50.0  # 1 of 2 hit
+            {"variant": "no_confidence_floor", "total": 130, "hold_net_avg_r": 0.49},
-    assert by_bucket["0-20%"]["n"] == 1
+            {"variant": "no_rr_floor", "total": 400, "hold_net_avg_r": 0.34},
-    assert by_bucket["0-20%"]["realized_hit_rate"] == 0.0
+            {"variant": "no_neutral_exclusion", "total": 120, "hold_net_avg_r": 0.46},
        ],
        "sweep": [
            {"min_momentum_percentile": 80.0, "net_avg_r": 0.13, "total": 100},
            {"min_momentum_percentile": 60.0, "net_avg_r": 0.05, "total": 300},
            {"min_momentum_percentile": 0.0, "net_avg_r": -0.12, "total": 1000},
        ],
        "portfolio_sim": {"policies": [
            {"policy": "target", "cagr_pct": 23.7, "total_return_pct": 134.8,
             "spy_return_pct": 95.9, "max_drawdown_pct": 20.7},
            {"policy": "hold", "cagr_pct": 31.9, "total_return_pct": 203.6,
             "spy_return_pct": 95.9, "max_drawdown_pct": 21.2},
        ]},
    }
    rec = bt._build_recommendation(report)
    by_topic: dict[str, list[str]] = {}
    for item in rec["items"]:
        by_topic.setdefault(item["topic"], []).append(item["text"])
    assert rec["headline"] is not None and "hold 30" in rec["headline"]
    assert any("hold 30 trading days" in t for t in by_topic["exit"])
    gate_texts = " | ".join(by_topic["gate"])
    assert "confidence floor adds nothing" in gate_texts
    assert "keep the R:R floor" in gate_texts
    assert "keep the NEUTRAL exclusion" in gate_texts
    assert "80" in by_topic["cutoff"][0]
    assert "beats" in by_topic["benchmark"][0]
    assert any("not a handful of outliers" in t for t in by_topic["robustness"])
 def test_build_recommendation_flags_outlier_dependence():
    """A negative ex-top-5% expectancy must yield a WARNING robustness item."""
    report = {
        "overall_qualified": {"net_avg_r": 0.13, "net_avg_r_ex_top5": -0.02},
    }
    rec = bt._build_recommendation(report)

    robustness_texts = []
    for item in rec["items"]:
        if item["topic"] == "robustness":
            robustness_texts.append(item["text"])

    # At least one robustness item exists, and the first one carries the warning.
    assert robustness_texts
    assert "WARNING" in robustness_texts[0]
 def test_window_setups_too_short_returns_empty():
@@ -495,8 +443,8 @@ async def test_run_backtest_smoke(session):
    assert report["tickers"] == 1
    assert isinstance(report["candidates"], int)
    for key in (
-        "overall_qualified", "overall_all", "by_direction", "calibration", "sweep",
+        "overall_qualified", "overall_all", "by_direction", "sweep",
-        "gate_ablation", "time_exit_sweep",
+        "gate_ablation", "time_exit_sweep", "portfolio_sim", "recommendation",
    ):
        assert key in report
    # the oscillating series should yield at least some resolved setups
@@ -526,7 +474,3 @@ async def test_run_backtest_smoke(session):
    sweep = sorted(report["sweep"], key=lambda r: r["min_momentum_percentile"], reverse=True)
    counts = [r["total"] for r in sweep]
    assert counts == sorted(counts)  # ascending as threshold descends
    # every calibration row is internally consistent
    for row in report["calibration"]:
        assert 0 <= row["realized_hit_rate"] <= 100
        assert row["n"] >= 1