# signal-platform/tests/unit/test_backtest_service.py

"""Tests for the historical backtest harness."""

from __future__ import annotations

import math
from datetime import date, timedelta
from types import SimpleNamespace

import pytest

from app.models.ohlcv import OHLCVRecord
from app.models.ticker import Ticker
from app.services import backtest_service as bt
from app.services.outcome_service import (
    OUTCOME_EXPIRED,
    OUTCOME_STOP_HIT,
    OUTCOME_TARGET_HIT,
)
from tests.conftest import _test_session_factory  # type: ignore


@pytest.fixture
async def session():
    """Provide a session obtained from the shared test session factory.

    The factory's context manager stays open while the requesting test runs
    and is exited once the test is done with the session.
    """
    async with _test_session_factory() as db_session:
        yield db_session


def _cand(
    prob: float,
    outcome: str,
    rr: float,
    qualified: bool = True,
    direction: str = "long",
    risk_pct: float = 0.05,
    hold_days: int = 10,
) -> dict:
    """Build a resolved candidate dict for the bucket/calibration tests.

    ``realized_r`` is derived from ``outcome``: a target hit realizes the full
    ``rr``, an expired setup realizes 0R, and anything else realizes -1R.
    """
    hit = outcome == OUTCOME_TARGET_HIT
    if hit:
        realized_r = rr
    elif outcome == OUTCOME_EXPIRED:
        realized_r = 0.0
    else:
        realized_r = -1.0
    return dict(
        primary_prob=prob,
        outcome=outcome,
        target_hit=hit,
        rr=rr,
        realized_r=realized_r,
        qualified=qualified,
        direction=direction,
        risk_pct=risk_pct,
        hold_days=hold_days,
    )


# Round-trip cost (two sides at bt.COST_PER_SIDE each) expressed in R for a
# candidate whose risk_pct is 0.05, the _cand default. With a per-side cost of
# 0.001 this works out to 2 * 0.001 / 0.05 = 0.04R.
_COST_R_005 = 2 * bt.COST_PER_SIDE / 0.05


def _bar(high: float, low: float, close: float, open_: float | None = None) -> SimpleNamespace:
    """Create a synthetic daily bar exposing ``high``, ``low``, ``close`` and ``open``.

    When ``open_`` is omitted the bar opens at its high, so a stop is only
    pierced intraday (fill at the stop level). Supply an explicit ``open_``
    beyond the stop to model a gap through it.
    """
    opening = high if open_ is None else open_
    return SimpleNamespace(high=high, low=low, close=close, open=opening)


class TestStopFillR:
    """Realized R reported by ``bt._stop_fill_r`` for a bar that hits the stop."""

    def test_intraday_fill_at_stop(self):
        # The bar opens at its high (101), so the 95 stop is only pierced
        # intraday and the loss is exactly one R.
        bar = _bar(101, 94, 96)
        realized = bt._stop_fill_r("long", 100.0, 95.0, bar)
        assert realized == pytest.approx(-1.0)

    def test_gap_fill_at_open(self):
        # The bar opens at 92, already below the 95 stop, so the fill is the
        # open: (92 - 100) / 5 = -1.6R, worse than -1R.
        gap_bar = _bar(93, 90, 91, open_=92)
        realized = bt._stop_fill_r("long", 100.0, 95.0, gap_bar)
        assert realized == pytest.approx(-1.6)

    def test_short_gap_fill_at_open(self):
        # Short with a 105 stop; the bar opens at 107 above it, so the fill
        # is 107: (100 - 107) / 5 = -1.4R.
        gap_bar = _bar(110, 104, 108, open_=107)
        realized = bt._stop_fill_r("short", 100.0, 105.0, gap_bar)
        assert realized == pytest.approx(-1.4)


class TestTakeProfitPrimitives:
    """Checks on the 6-tuple returned by ``bt._tp_primitives``.

    The tests unpack it as (risk, stopped, mfe, close_pct, stop_day, stop_r).
    """

    def test_long_tp_reachable_before_stop(self):
        window = [_bar(109, 101, 108)]
        risk, stopped, mfe, close_pct, stop_day, _ = bt._tp_primitives(
            "long", 100.0, 95.0, window, 30
        )
        assert risk == pytest.approx(0.05)
        assert stopped is False
        assert mfe == pytest.approx(0.09)
        assert close_pct == pytest.approx(0.08)
        assert stop_day is None

    def test_long_stop_zeroes_mfe(self):
        # The only bar's low pierces the stop: a loss with nothing banked first.
        window = [_bar(101, 94, 96)]
        _, stopped, mfe, close_pct, stop_day, stop_r = bt._tp_primitives(
            "long", 100.0, 95.0, window, 30
        )
        assert stopped is True
        assert mfe == pytest.approx(0.0)
        assert close_pct == pytest.approx(-0.04)
        assert stop_day == 1
        assert stop_r == pytest.approx(-1.0)

    def test_gap_through_stop_loses_more_than_1r(self):
        gap_window = [_bar(93, 90, 91, open_=92)]
        outcome = bt._tp_primitives("long", 100.0, 95.0, gap_window, 30)
        _, stopped, _, _, stop_day, stop_r = outcome
        assert stopped is True
        assert stop_day == 1
        # The fill is the 92 open rather than the 95 stop.
        assert stop_r == pytest.approx(-1.6)

    def test_long_drift_no_trigger(self):
        window = [_bar(102, 99, 101), _bar(103, 100, 102)]
        _, stopped, mfe, close_pct, _, _ = bt._tp_primitives(
            "long", 100.0, 95.0, window, 30
        )
        assert stopped is False
        assert mfe == pytest.approx(0.03)
        assert close_pct == pytest.approx(0.02)

    def test_short_direction(self):
        # Short from 100 with a 105 stop; the price falls, so the favourable
        # excursion is measured as (entry - low) / entry.
        window = [_bar(101, 92, 93)]
        risk, stopped, mfe, close_pct, _, _ = bt._tp_primitives(
            "short", 100.0, 105.0, window, 30
        )
        assert risk == pytest.approx(0.05)
        assert stopped is False
        assert mfe == pytest.approx(0.08)
        assert close_pct == pytest.approx(0.07)


class TestTakeProfitBucket:
    """Aggregate statistics produced by ``bt._take_profit_bucket``."""

    def test_bucket_mix(self):
        winner = {"risk_pct": 0.05, "mfe_pct": 0.09, "tp_stopped": False, "tp_close_pct": 0.08}
        stopped_out = {"risk_pct": 0.05, "mfe_pct": 0.02, "tp_stopped": True, "tp_close_pct": -0.04}
        timed_out = {"risk_pct": 0.05, "mfe_pct": 0.03, "tp_stopped": False, "tp_close_pct": 0.01}

        # At a 0.08 take-profit: +1.6R win, -1R stop, +0.2R timeout.
        bucket = bt._take_profit_bucket([winner, stopped_out, timed_out], 0.08)

        assert bucket["total"] == 3
        assert bucket["wins"] == 1
        assert bucket["hit_rate"] == pytest.approx(33.3, abs=0.1)
        assert bucket["total_r"] == pytest.approx(0.8, abs=0.01)
        assert bucket["avg_r"] == pytest.approx(0.267, abs=0.01)

        # Net figures subtract one 0.04R round trip per candidate (risk_pct 0.05).
        net_total = 0.8 - 3 * _COST_R_005
        assert bucket["net_total_r"] == pytest.approx(net_total, abs=0.01)
        assert bucket["net_avg_r"] == pytest.approx((0.8 - 3 * _COST_R_005) / 3, abs=0.01)

    def test_zero_risk_skipped(self):
        zero_risk = {"risk_pct": 0.0, "mfe_pct": 0.2, "tp_stopped": False, "tp_close_pct": 0.1}
        bucket = bt._take_profit_bucket([zero_risk], 0.08)
        assert bucket["total"] == 0
        assert bucket["avg_r"] is None


class TestTrailingExits:
    """Realized R per trail width as returned by ``bt._trailing_exits``.

    Results are looked up by the trail width in whole percent (``res[10]`` for
    a 0.10 trail).
    """

    def test_locks_gain_on_pullback(self):
        # Price runs to 120; the 10% trail then sits at 108 and is pierced on
        # the following drop: (108 - 100) / 100 / 0.10 risk = +0.8R.
        path = [_bar(120, 110, 118), _bar(130, 100, 105)]
        by_width = bt._trailing_exits("long", 100.0, 90.0, (0.10,), path, 30)
        assert by_width[10] == pytest.approx(0.8)

    def test_initial_stop_caps_loss(self):
        # A 20% trail is looser than the initial stop, so the initial stop
        # governs and the loss is -1R.
        path = [_bar(101, 89, 90)]
        by_width = bt._trailing_exits("long", 100.0, 90.0, (0.20,), path, 30)
        assert by_width[20] == pytest.approx(-1.0)

    def test_timeout_exits_at_close(self):
        # Nothing triggers; exit at the last close 105 → +5% / 10% risk.
        path = [_bar(105, 98, 104), _bar(106, 100, 105)]
        by_width = bt._trailing_exits("long", 100.0, 90.0, (0.20,), path, 30)
        assert by_width[20] == pytest.approx(0.5)

    def test_multiple_widths_one_pass(self):
        # The tighter trail locks in more on this path (exit at 114 vs 108).
        path = [_bar(120, 110, 118), _bar(130, 100, 105)]
        widths = (0.10, 0.05)
        by_width = bt._trailing_exits("long", 100.0, 90.0, widths, path, 30)
        assert by_width[10] == pytest.approx(0.8)
        assert by_width[5] == pytest.approx(1.4)

    def test_gap_through_stop_fills_at_open(self):
        # The initial stop at 90 governs (a 20% trail from the 100 peak is
        # lower). The bar opens at 85, below the stop, so the fill is the open.
        gap_path = [_bar(88, 84, 86, open_=85)]
        by_width = bt._trailing_exits("long", 100.0, 90.0, (0.20,), gap_path, 30)
        assert by_width[20] == pytest.approx(-1.5)


class TestTrailingBucket:
    """Aggregate statistics produced by ``bt._trailing_bucket``."""

    def test_bucket(self):
        """Three candidates at the 5% trail width: two winners, one -1R loss.

        The expected net figures are derived from ``bt.COST_PER_SIDE`` rather
        than hard-coded, so the test keeps tracking the service's cost
        assumption the same way the module-level ``_COST_R_005`` does.
        """
        risk_pct = 0.10
        # Round-trip cost in R for this risk_pct: 2 * 0.001 / 0.10 = 0.02R at
        # the current per-side cost.
        cost_r = 2 * bt.COST_PER_SIDE / risk_pct
        cands = [
            {"trail_r": {5: 1.4, 10: 0.8}, "risk_pct": risk_pct},
            {"trail_r": {5: -1.0, 10: -1.0}, "risk_pct": risk_pct},
            {"trail_r": {5: 0.5, 10: 0.5}, "risk_pct": risk_pct},
        ]
        b = bt._trailing_bucket(cands, 5)
        assert b["total"] == 3
        assert b["wins"] == 2
        assert b["win_rate"] == pytest.approx(66.7, abs=0.1)
        # Gross: 1.4 - 1.0 + 0.5 = 0.9R in total, 0.3R on average.
        assert b["total_r"] == pytest.approx(0.9, abs=0.01)
        assert b["avg_r"] == pytest.approx(0.3, abs=0.01)
        # Net: one round trip deducted per candidate.
        assert b["net_total_r"] == pytest.approx(0.9 - 3 * cost_r, abs=0.01)
        assert b["net_avg_r"] == pytest.approx(0.3 - cost_r, abs=0.01)


class TestTimeExits:
    """Realized R per hold length as returned by ``bt._time_exits``."""

    def test_long_exits_at_horizon_close(self):
        path = [_bar(103, 99, 102), _bar(105, 101, 104), _bar(107, 103, 106)]
        by_hold = bt._time_exits("long", 100.0, 95.0, path, (2, 5))
        # Day-2 close 104 → +4% / 5% risk.
        assert by_hold[2] == pytest.approx(0.8)
        # Only three bars exist, so the 5-day hold exits at the last close 106.
        assert by_hold[5] == pytest.approx(1.2)

    def test_stop_on_first_bar_loses_everywhere(self):
        path = [_bar(101, 94, 96), _bar(105, 101, 104)]
        by_hold = bt._time_exits("long", 100.0, 95.0, path, (1, 5))
        assert by_hold[1] == pytest.approx(-1.0)
        assert by_hold[5] == pytest.approx(-1.0)

    def test_stop_after_short_horizon_only_hits_long_hold(self):
        # The 2-day hold banks the day-2 close; the day-3 stop only affects
        # the 5-day hold.
        path = [_bar(103, 99, 102), _bar(104, 100, 103), _bar(101, 94, 95)]
        by_hold = bt._time_exits("long", 100.0, 95.0, path, (2, 5))
        # Close 103 → +3% / 5% risk.
        assert by_hold[2] == pytest.approx(0.6)
        assert by_hold[5] == pytest.approx(-1.0)

    def test_short_direction(self):
        path = [_bar(101, 95, 96)]
        by_hold = bt._time_exits("short", 100.0, 105.0, path, (1,))
        # Close 96 on a short from 100 → +4% / 5% risk.
        assert by_hold[1] == pytest.approx(0.8)

    def test_zero_risk_returns_zero(self):
        # Entry and stop coincide, so there is no risk to normalize by.
        path = [_bar(103, 99, 102)]
        by_hold = bt._time_exits("long", 100.0, 100.0, path, (5,))
        assert by_hold[5] == 0.0

    def test_gap_through_stop_fills_at_open(self):
        gap_path = [_bar(93, 90, 91, open_=92)]
        by_hold = bt._time_exits("long", 100.0, 95.0, gap_path, (5,))
        assert by_hold[5] == pytest.approx(-1.6)


class TestTimeExitBucket:
    """Aggregate statistics produced by ``bt._time_exit_bucket``."""

    def test_bucket(self):
        """Three candidates graded at the 5-day hold: two winners, one -1R loss.

        The expected net figures are derived from ``bt.COST_PER_SIDE`` rather
        than hard-coded, so the test keeps tracking the service's cost
        assumption the same way the module-level ``_COST_R_005`` does.
        """
        risk_pct = 0.10
        # Round-trip cost in R for this risk_pct: 2 * 0.001 / 0.10 = 0.02R at
        # the current per-side cost.
        cost_r = 2 * bt.COST_PER_SIDE / risk_pct
        cands = [
            {"time_r": {5: 1.4, 21: 0.8}, "risk_pct": risk_pct},
            {"time_r": {5: -1.0, 21: -1.0}, "risk_pct": risk_pct},
            {"time_r": {5: 0.5, 21: 0.5}, "risk_pct": risk_pct},
        ]
        b = bt._time_exit_bucket(cands, 5)
        # Gross average: (1.4 - 1.0 + 0.5) / 3 = 0.3R; net deducts one round trip.
        net_avg = 0.3 - cost_r
        assert b["hold_days"] == 5
        assert b["total"] == 3
        assert b["wins"] == 2
        assert b["win_rate"] == pytest.approx(66.7, abs=0.1)
        assert b["avg_r"] == pytest.approx(0.3, abs=0.01)
        assert b["net_avg_r"] == pytest.approx(net_avg, abs=0.01)
        assert b["best_r"] == pytest.approx(1.4)
        assert b["worst_r"] == pytest.approx(-1.0)
        # No stop_day on any candidate → every hold runs the full 5 days.
        assert b["avg_hold_days"] == 5.0
        assert b["net_r_per_day"] == pytest.approx(net_avg / 5.0, abs=0.001)

    def test_missing_hold_skipped(self):
        """A candidate with no result for the requested hold is not counted."""
        b = bt._time_exit_bucket([{"time_r": {5: 1.0}}], 21)
        assert b["total"] == 0
        assert b["avg_r"] is None


def _acand(
    rr: float = 2.0,
    conf: float = 60.0,
    action: str = "LONG_MODERATE",
    mp: float | None = 90.0,
    direction: str = "long",
) -> dict:
    """Build an ablation candidate that always resolved as a target hit.

    ``meets_core`` mirrors the default floors: min_rr 1.2, min_confidence 55
    and NEUTRAL actions excluded. Every hold length in ``bt.TIME_EXIT_DAYS``
    is given a 0.5R time-exit result.
    """
    floor_checks = (rr >= 1.2, conf >= 55.0, action != "NEUTRAL")
    return dict(
        rr=rr,
        confidence=conf,
        action=action,
        momentum_percentile=mp,
        direction=direction,
        meets_core=all(floor_checks),
        risk_level="Low",
        target_hit=True,
        outcome=OUTCOME_TARGET_HIT,
        realized_r=rr,
        risk_pct=0.05,
        time_r=dict.fromkeys(bt.TIME_EXIT_DAYS, 0.5),
    )


class TestGateAblation:
    """Per-variant rows produced by ``bt._gate_ablation``."""

    # Activation settings handed to the ablation in every test below.
    ACTIVATION = {
        "min_rr": 1.2,
        "min_confidence": 55.0,
        "exclude_neutral": True,
        "require_high_conviction": False,
        "exclude_conflicts": False,
    }

    def test_variant_counts(self):
        passes_everything = _acand()
        low_confidence = _acand(conf=40.0)           # fails confidence floor
        low_rr = _acand(rr=1.0)                      # fails R:R floor
        neutral = _acand(action="NEUTRAL")           # fails NEUTRAL exclusion
        weak_momentum = _acand(mp=50.0)              # fails the momentum cutoff
        short_side = _acand(direction="short", mp=95.0)  # short — gated out
        pool = [passes_everything, low_confidence, low_rr, neutral, weak_momentum, short_side]

        ablation = bt._gate_ablation(pool, self.ACTIVATION, 80.0)
        rows = {row["variant"]: row for row in ablation}

        expected_totals = {
            "all_floors": 1,
            "no_confidence_floor": 2,
            "no_rr_floor": 2,
            "no_neutral_exclusion": 2,
            "momentum_only": 4,
        }
        for variant, total in expected_totals.items():
            assert rows[variant]["total"] == total

        baseline = rows["all_floors"]
        assert baseline["net_avg_r"] is not None
        # Each variant is additionally graded under the hold-to-horizon exit.
        assert baseline["hold_days"] == max(bt.TIME_EXIT_DAYS)
        assert baseline["hold_avg_r"] == pytest.approx(0.5)
        assert baseline["hold_net_avg_r"] is not None
        assert rows["momentum_only"]["hold_total_r"] == pytest.approx(4 * 0.5, abs=0.01)

    def test_threshold_zero_disables_momentum_gate(self):
        # With only the floors active, both the low-momentum long and the
        # short are counted under all_floors.
        pool = [_acand(mp=50.0), _acand(direction="short", mp=None)]
        ablation = bt._gate_ablation(pool, self.ACTIVATION, 0.0)
        rows = {row["variant"]: row for row in ablation}
        assert rows["all_floors"]["total"] == 2


def _sim_prices(start_ord: int, closes: list[float]) -> tuple:
    """Return column arrays for consecutive daily bars built from ``closes``.

    The tuple holds, in order: date ordinals counting up from ``start_ord``,
    opens (equal to the closes, so no gaps), highs (close + 1), lows
    (close - 1), closes, and a constant volume of 1,000,000 per bar.
    """
    count = len(closes)
    ordinals = list(range(start_ord, start_ord + count))
    opens = list(closes)
    highs = [price + 1.0 for price in closes]
    lows = [price - 1.0 for price in closes]
    volumes = [1_000_000] * count
    return ordinals, opens, highs, lows, list(closes), volumes


def _sim_cand(
    sym: str, day_ord: int, entry: float, stop: float, target: float, mp: float = 90.0
) -> dict:
    """Build a qualified long candidate for the portfolio simulation tests.

    ``day_ord`` is a proleptic Gregorian ordinal; it is stored under ``date``
    as an ISO-format string.
    """
    signal_day = date.fromordinal(day_ord)
    return dict(
        qualified=True,
        direction="long",
        symbol=sym,
        date=signal_day.isoformat(),
        entry=entry,
        stop=stop,
        target=target,
        momentum_percentile=mp,
    )


class TestSimulatePortfolio:
    """Summary statistics returned by ``bt._simulate_portfolio``."""

    # Date ordinal of the first synthetic bar (2025-01-06).
    ORD = date(2025, 1, 6).toordinal()

    def test_hold_policy_accounting(self):
        rising = [100.0, 102.0, 104.0, 106.0, 108.0, 110.0]
        price_map = {"AAA": _sim_prices(self.ORD, rising)}
        setup = _sim_cand("AAA", self.ORD, entry=100.0, stop=95.0, target=130.0)

        summary = bt._simulate_portfolio([setup], price_map, None, "hold", 3)

        assert summary is not None
        assert summary["trades"] == 1
        # Position is 20 shares (1% risk / $5 stop distance), sold at the
        # day-3 close of 106:
        # pnl = 2120 − 2000 − 2.00 entry cost − 2.12 exit cost = 115.88
        assert summary["final_equity"] == pytest.approx(10_115.88, abs=0.01)
        assert summary["win_rate"] == 100.0
        assert summary["best_trade_r"] == pytest.approx(1.2)
        assert summary["avg_hold_days"] == 3.0
        assert summary["max_drawdown_pct"] == 0.0
        # The window is far too short to annualize.
        assert summary["cagr_pct"] is None
        assert summary["spy_return_pct"] is None

    def test_target_policy_exits_at_target(self):
        rising = [100.0, 102.0, 104.0, 106.0, 108.0, 110.0]
        price_map = {"AAA": _sim_prices(self.ORD, rising)}
        setup = _sim_cand("AAA", self.ORD, entry=100.0, stop=95.0, target=105.0)

        summary = bt._simulate_portfolio([setup], price_map, None, "target", 30)

        assert summary is not None
        assert summary["trades"] == 1
        # Filled exactly at the 105 target → +1R.
        assert summary["best_trade_r"] == pytest.approx(1.0)

    def test_stop_gap_fills_at_open(self):
        # The day-1 bar gaps down to a 90 open, below the 95 stop, so the
        # fill is the open.
        ordinals = list(range(self.ORD, self.ORD + 2))
        columns = (ordinals, [100.0, 90.0], [101.0, 92.0], [99.0, 88.0], [100.0, 91.0], [1, 1])
        price_map = {"AAA": columns}
        setup = _sim_cand("AAA", self.ORD, entry=100.0, stop=95.0, target=120.0)

        summary = bt._simulate_portfolio([setup], price_map, None, "hold", 30)

        assert summary is not None
        assert summary["trades"] == 1
        # (90 − 100) / 5
        assert summary["worst_trade_r"] == pytest.approx(-2.0)

    def test_nothing_qualified_returns_none(self):
        outcome = bt._simulate_portfolio([], {}, None, "hold", 30)
        assert outcome is None


def test_bucket_stats_counts_and_expectancy():
    """Two wins, one loss and one expiry: check counts, gross R and net R."""
    win_3r = _cand(70, OUTCOME_TARGET_HIT, 3.0)   # +3R
    win_2r = _cand(60, OUTCOME_TARGET_HIT, 2.0)   # +2R
    loss = _cand(40, OUTCOME_STOP_HIT, 3.0)       # -1R
    expired = _cand(30, OUTCOME_EXPIRED, 3.0)     # 0R

    stats = bt._bucket_stats([win_3r, win_2r, loss, expired])

    expected_counts = {"total": 4, "wins": 2, "losses": 1, "expired": 1}
    for field, count in expected_counts.items():
        assert stats[field] == count

    # The hit rate only considers decided setups (wins + losses).
    assert stats["hit_rate"] == round(2 / 3 * 100, 1)
    # Gross: (3 + 2 - 1 + 0) / 4 = 1.0R on average, 4.0R in total.
    assert stats["avg_r"] == 1.0
    assert stats["total_r"] == 4.0
    # Net deducts one 0.04R round trip per candidate (risk_pct 0.05).
    net_avg = 1.0 - _COST_R_005
    assert stats["net_avg_r"] == pytest.approx(net_avg, abs=0.001)
    assert stats["net_total_r"] == pytest.approx(4.0 - 4 * _COST_R_005, abs=0.01)
    assert stats["best_r"] == 3.0
    assert stats["worst_r"] == -1.0
    assert stats["avg_hold_days"] == 10.0
    assert stats["net_r_per_day"] == pytest.approx((1.0 - _COST_R_005) / 10.0, abs=0.001)


def test_bucket_stats_empty():
    """An empty candidate list yields a zero total and None for the ratios."""
    stats = bt._bucket_stats([])
    assert stats["total"] == 0
    for ratio_field in ("hit_rate", "avg_r", "net_avg_r"):
        assert stats[ratio_field] is None


def test_bucket_stats_no_risk_pct_means_no_cost():
    """Without a ``risk_pct`` on the candidate, net figures equal gross figures."""
    candidate = _cand(50, OUTCOME_TARGET_HIT, 2.0)
    del candidate["risk_pct"]
    stats = bt._bucket_stats([candidate])
    assert stats["net_avg_r"] == stats["avg_r"]
    assert stats["net_total_r"] == stats["total_r"]


def test_calibration_buckets():
    """Candidates are grouped by probability band with a realized hit rate."""
    sample = [
        _cand(65, OUTCOME_TARGET_HIT, 2.0),
        _cand(62, OUTCOME_STOP_HIT, 2.0),
        _cand(15, OUTCOME_STOP_HIT, 2.0),
    ]
    by_bucket = {row["bucket"]: row for row in bt._calibration(sample)}

    upper = by_bucket["60-80%"]
    assert upper["n"] == 2
    assert upper["realized_hit_rate"] == 50.0  # one of the two hit

    lowest = by_bucket["0-20%"]
    assert lowest["n"] == 1
    assert lowest["realized_hit_rate"] == 0.0


def test_window_setups_too_short_returns_empty():
    """An empty bar window produces no setups."""
    setups = bt._window_setups([], {}, {})
    assert setups == []


def test_replay_ticker_candidates_carry_gate_fields():
    """Replayed candidates must expose the fields the gate ablation reads.

    The ablation recomputes floors from candidate fields, so a candidate
    missing action/risk_level silently zeroes the ablation rows (July 2026
    regression).
    """
    from app.services.admin_service import ACTIVATION_DEFAULTS
    from app.services.recommendation_service import DEFAULT_RECOMMENDATION_CONFIG

    first_day = date(2025, 1, 1)

    def _osc_bar(i: int) -> SimpleNamespace:
        # Sine wave around 100 with a ±1.5 range and a slightly varying volume.
        close = 100.0 + 8.0 * math.sin(i / 6.0)
        return SimpleNamespace(
            date=first_day + timedelta(days=i),
            open=close,
            high=close + 1.5,
            low=close - 1.5,
            close=close,
            volume=1_000_000 + (i % 5) * 1000,
        )

    bars = [_osc_bar(i) for i in range(160)]
    rec_config = dict(DEFAULT_RECOMMENDATION_CONFIG)
    activation = dict(ACTIVATION_DEFAULTS)

    cands = bt._replay_ticker("OSC", bars, rec_config, activation)

    assert cands, "expected the oscillating series to produce candidates"
    for candidate in cands:
        assert candidate.get("action") is not None
        assert "risk_level" in candidate


async def _seed_oscillating_ticker(session, symbol: str, n: int = 160) -> None:
    """Insert a ticker plus ``n`` consecutive daily bars of a sine-wave price.

    The ticker is flushed first so its ``id`` can be referenced by the OHLCV
    rows; everything is committed at the end.
    """
    ticker = Ticker(symbol=symbol)
    session.add(ticker)
    await session.flush()

    first_day = date(2025, 1, 1)
    for offset in range(n):
        # Close oscillates around 100; high/low sit 1.5 either side of it.
        close = 100.0 + 8.0 * math.sin(offset / 6.0)
        record = OHLCVRecord(
            ticker_id=ticker.id,
            date=first_day + timedelta(days=offset),
            open=close,
            high=close + 1.5,
            low=close - 1.5,
            close=close,
            volume=1_000_000 + (offset % 5) * 1000,
        )
        session.add(record)

    await session.commit()


async def test_run_backtest_smoke(session):
    """Seed one oscillating ticker, run the backtest and sanity-check the report."""
    await _seed_oscillating_ticker(session, "OSC")
    report = await bt.run_backtest(session)

    # The report is well-formed.
    assert report["tickers"] == 1
    assert isinstance(report["candidates"], int)
    required_sections = (
        "overall_qualified", "overall_all", "by_direction", "calibration", "sweep",
        "gate_ablation", "time_exit_sweep",
    )
    for section in required_sections:
        assert section in report
    # The oscillating series should yield at least some resolved setups.
    assert report["candidates"] >= 1

    # The cost assumption is reported and buckets carry net numbers.
    reported_cost = report["params"]["cost_per_side_pct"]
    assert reported_cost == pytest.approx(bt.COST_PER_SIDE * 100)
    assert "net_avg_r" in report["overall_all"]

    # The ablation baseline reproduces the qualified set exactly, and every
    # row carries the hold-to-horizon grading alongside the target model.
    ablation_rows = report["gate_ablation"]
    by_variant = {row["variant"]: row for row in ablation_rows}
    assert by_variant["all_floors"]["total"] == report["overall_qualified"]["total"]
    for ablation_row in ablation_rows:
        assert "hold_net_avg_r" in ablation_row

    # The time-exit sweep covers the configured hold lengths.
    swept_holds = [row["hold_days"] for row in report["time_exit_sweep"]]
    assert swept_holds == list(bt.TIME_EXIT_DAYS)

    # The portfolio simulation section is always present (policies may be
    # empty when nothing qualifies).
    assert "portfolio_sim" in report
    portfolio = report["portfolio_sim"]
    assert isinstance(portfolio["policies"], list)
    assert portfolio["params"]["max_positions"] == bt.SIM_MAX_POSITIONS

    # Sweep: lowering the momentum-percentile cutoff can only add qualifiers,
    # so totals ascend as the threshold descends.
    descending_cutoff = sorted(
        report["sweep"], key=lambda row: row["min_momentum_percentile"], reverse=True
    )
    totals = [row["total"] for row in descending_cutoff]
    assert totals == sorted(totals)

    # Every calibration row is internally consistent.
    for calibration_row in report["calibration"]:
        assert 0 <= calibration_row["realized_hit_rate"] <= 100
        assert calibration_row["n"] >= 1