fix: only count matured setups in the live track record

The outcome stats were dominated by quick stop-outs: near stops resolve as losses within days while far targets take weeks, so a young sample (mostly pending, 0 expired) skewed sharply negative (e.g. 13.8% hit / -0.46R vs the backtest's 35.8% / +0.18R) — a maturation artifact, not a real result. get_performance_stats now counts only setups whose full evaluation window of ~30 trading days (_MATURITY_DAYS = 45 calendar days) has elapsed, so winners have had as long to resolve as losers (unbiased, and comparable to the backtest). A new `maturing` count reports the setups held back: resolved ones that are still too young, plus everything still pending. The Track Record UI relabels "Evaluated" -> "Matured", shows the maturing count, and explains the window in the empty state + methodology note. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-28 13:41:48 +02:00
parent 8bcbbfcfd0
commit 7e9a6cd7ec
4 changed files with 62 additions and 10 deletions
@@ -16,7 +16,7 @@ from __future__ import annotations

 import logging
 from dataclasses import dataclass
-from datetime import date, datetime, timezone
+from datetime import date, datetime, timedelta, timezone

 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
@@ -34,6 +34,13 @@ OUTCOME_EXPIRED = "expired"

 DEFAULT_MAX_BARS = 30

+# A setup's outcome is only unbiased once its full evaluation window has elapsed:
+# until then, near stops resolve as losses within days while far targets are still
+# pending, so a young sample skews sharply negative. Only count setups detected at
+# least this many CALENDAR days ago (~max_bars trading days, ×1.5 to cover
+# weekends/holidays). Younger setups are reported separately as "maturing".
+_MATURITY_DAYS = int(DEFAULT_MAX_BARS * 1.5)
+
 # Confidence buckets for the performance breakdown
 _CONFIDENCE_BUCKETS = [
    ("<50%", 0.0, 50.0),
@@ -183,7 +190,12 @@ async def get_performance_stats(
    db: AsyncSession,
    config: dict | None = None,
 ) -> dict:
-    """Aggregate outcome statistics over all evaluated trade setups.
+    """Aggregate outcome statistics over the *matured* evaluated trade setups.
+
+    Only setups whose full evaluation window has elapsed (see ``_MATURITY_DAYS``)
+    are counted, so the headline isn't dominated by quick stop-outs while slower
+    winners are still in flight. ``maturing`` counts the setups held back: those
+    resolved but still too young, plus every setup that is still pending.

    avg_r is the expectancy per trade in R-multiples (win = +rr_ratio,
    loss = -1R, expired = 0R). A positive avg_r means the signals have
@@ -197,13 +209,23 @@ async def get_performance_stats(
    result = await db.execute(
        select(TradeSetup).where(TradeSetup.actual_outcome.is_not(None))
    )
-    evaluated = list(result.scalars().all())
+    evaluated_all = list(result.scalars().all())
+
+    # Matured cohort only — see _MATURITY_DAYS. Setups whose window hasn't fully
+    # elapsed are excluded so quick stop-outs can't drag the headline negative
+    # while their slower-to-resolve winners are still in flight.
+    cutoff_date = (datetime.now(timezone.utc) - timedelta(days=_MATURITY_DAYS)).date()
+    evaluated = [s for s in evaluated_all if s.detected_at.date() <= cutoff_date]

    pending_result = await db.execute(
        select(TradeSetup.id).where(TradeSetup.actual_outcome.is_(None))
    )
    pending_count = len(pending_result.scalars().all())

+    # Still inside their measurement window (excluded above so they can't bias the
+    # stats): young setups that already resolved + everything still pending.
+    maturing_count = (len(evaluated_all) - len(evaluated)) + pending_count
+
    if config is not None:
        qualified = [s for s in evaluated if setup_qualifies(s, config)]
    else:
@@ -229,6 +251,7 @@ async def get_performance_stats(
    return {
        "overall": _bucket_stats(qualified),
        "pending": pending_count,
+        "maturing": maturing_count,
        "by_direction": {k: _bucket_stats(v) for k, v in sorted(by_direction.items())},
        "by_action": {k: _bucket_stats(v) for k, v in sorted(by_action.items())},
        "by_confidence": {