From 7e9a6cd7ece1ced5b737171419f0d305c3fccde7 Mon Sep 17 00:00:00 2001
From: Dennis Thiessen <dennis@thiessen.io>
Date: Sun, 28 Jun 2026 13:41:48 +0200
Subject: [PATCH] fix: only count matured setups in the live track record
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The outcome stats were dominated by quick stop-outs: near stops resolve as losses
within days while far targets take weeks, so a young sample (mostly pending,
0 expired) skewed sharply negative (e.g. 13.8% hit / -0.46R vs the backtest's
35.8% / +0.18R) — a maturation artifact, not a real result.

get_performance_stats now counts only setups whose full evaluation window has
elapsed: 30 trading days, approximated as 45 calendar days (_MATURITY_DAYS), so
winners had as long as losers (unbiased, and comparable to the backtest). A new
`maturing` count reports the setups held back: young ones that already resolved
plus everything still pending. The Track Record UI relabels "Evaluated" ->
"Matured", shows the maturing count, and explains the window in the empty state
and methodology note.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 app/services/outcome_service.py               | 29 +++++++++++++++++--
 .../components/signals/TrackRecordPanel.tsx   | 19 +++++++-----
 frontend/src/lib/types.ts                     |  1 +
 tests/unit/test_outcome_service.py            | 23 +++++++++++++++
 4 files changed, 62 insertions(+), 10 deletions(-)

diff --git a/app/services/outcome_service.py b/app/services/outcome_service.py
index 42460d0..5bb6a79 100644
--- a/app/services/outcome_service.py
+++ b/app/services/outcome_service.py
@@ -16,7 +16,7 @@ from __future__ import annotations
 
 import logging
 from dataclasses import dataclass
-from datetime import date, datetime, timezone
+from datetime import date, datetime, timedelta, timezone
 
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
@@ -34,6 +34,13 @@ OUTCOME_EXPIRED = "expired"
 
 DEFAULT_MAX_BARS = 30
 
+# A setup's outcome is only unbiased once its full evaluation window has elapsed:
+# until then, near stops resolve as losses within days while far targets are still
+# pending, so a young sample skews sharply negative. Only count setups detected at
+# least this many CALENDAR days ago (~max_bars trading days, ×1.5 to cover
+# weekends/holidays). Younger setups are reported separately as "maturing".
+_MATURITY_DAYS = int(DEFAULT_MAX_BARS * 1.5)
+
 # Confidence buckets for the performance breakdown
 _CONFIDENCE_BUCKETS = [
     ("<50%", 0.0, 50.0),
@@ -183,7 +190,12 @@ async def get_performance_stats(
     db: AsyncSession,
     config: dict | None = None,
 ) -> dict:
-    """Aggregate outcome statistics over all evaluated trade setups.
+    """Aggregate outcome statistics over the *matured* evaluated trade setups.
+
+    Only setups whose full evaluation window has elapsed (see ``_MATURITY_DAYS``)
+    are counted, so the headline isn't dominated by quick stop-outs while slower
+    winners are still in flight. ``maturing`` reports how many are excluded for
+    being too young.
 
     avg_r is the expectancy per trade in R-multiples (win = +rr_ratio,
     loss = -1R, expired = 0R). A positive avg_r means the signals have
@@ -197,13 +209,23 @@ async def get_performance_stats(
     result = await db.execute(
         select(TradeSetup).where(TradeSetup.actual_outcome.is_not(None))
     )
-    evaluated = list(result.scalars().all())
+    evaluated_all = list(result.scalars().all())
+
+    # Matured cohort only — see _MATURITY_DAYS. Setups whose window hasn't fully
+    # elapsed are excluded so quick stop-outs can't drag the headline negative
+    # while their slower-to-resolve winners are still in flight.
+    cutoff_date = (datetime.now(timezone.utc) - timedelta(days=_MATURITY_DAYS)).date()
+    evaluated = [s for s in evaluated_all if s.detected_at.date() <= cutoff_date]
 
     pending_result = await db.execute(
         select(TradeSetup.id).where(TradeSetup.actual_outcome.is_(None))
     )
     pending_count = len(pending_result.scalars().all())
 
+    # Still inside their measurement window (excluded above so they can't bias the
+    # stats): young setups that already resolved + everything still pending.
+    maturing_count = (len(evaluated_all) - len(evaluated)) + pending_count
+
     if config is not None:
         qualified = [s for s in evaluated if setup_qualifies(s, config)]
     else:
@@ -229,6 +251,7 @@ async def get_performance_stats(
     return {
         "overall": _bucket_stats(qualified),
         "pending": pending_count,
+        "maturing": maturing_count,
         "by_direction": {k: _bucket_stats(v) for k, v in sorted(by_direction.items())},
         "by_action": {k: _bucket_stats(v) for k, v in sorted(by_action.items())},
         "by_confidence": {
diff --git a/frontend/src/components/signals/TrackRecordPanel.tsx b/frontend/src/components/signals/TrackRecordPanel.tsx
index a536cc8..719a8ec 100644
--- a/frontend/src/components/signals/TrackRecordPanel.tsx
+++ b/frontend/src/components/signals/TrackRecordPanel.tsx
@@ -171,7 +171,10 @@ export function TrackRecordPanel() {
             neither level hit within 30 trading days <span className="text-gray-300">expire</span> at
             0R. Avg R is the expectancy per trade: wins earn their R:R ratio, losses cost −1R — a
             positive value means the signals have been profitable on a risk-adjusted basis. The
-            evaluator runs nightly after OHLCV collection.
+            evaluator runs nightly after OHLCV collection. Only setups whose full 30-day window has
+            elapsed are counted — younger ones show as <span className="text-gray-300">maturing</span>,
+            since near stops resolve in days while far targets need time, so early numbers would skew
+            negative.
           </p>
         </Disclosure>
         <div className="flex shrink-0 items-center gap-2">
@@ -198,10 +201,12 @@ export function TrackRecordPanel() {
 
       {data && data.overall.total === 0 && (
         <Callout variant="empty">
-          {qualifiedOnly
-            ? 'No evaluated setups meet the activation thresholds yet. Untick "Qualified signals only" to see all evaluated setups, or wait for more outcomes.'
-            : 'No evaluated setups yet. Outcomes appear once setups are old enough for their stop or target to be hit — the evaluator runs nightly, or click Evaluate Now.'}
-          {data.pending > 0 && ` ${data.pending} setup${data.pending === 1 ? '' : 's'} pending evaluation.`}
+          {data.maturing > 0
+            ? `No setups have completed their ~30-day evaluation window yet — ${data.maturing} still maturing. ` +
+              'Stats appear once a setup’s full window has elapsed; counting them earlier would skew toward quick stop-outs.'
+            : qualifiedOnly
+              ? 'No matured setups meet the activation thresholds yet. Untick "Qualified signals only" to see all, or wait for more outcomes.'
+              : 'No matured setups yet. Outcomes appear once setups complete their evaluation window — the evaluator runs nightly, or click Evaluate Now.'}
         </Callout>
       )}
 
@@ -226,9 +231,9 @@ export function TrackRecordPanel() {
               sub="cumulative risk-adjusted result"
             />
             <StatCard
-              label="Evaluated"
+              label="Matured"
               value={String(data.overall.total)}
-              sub={`${data.pending} pending · ${data.overall.expired} expired`}
+              sub={`${data.maturing} maturing · ${data.overall.expired} expired`}
             />
           </div>
 
diff --git a/frontend/src/lib/types.ts b/frontend/src/lib/types.ts
index 2b18666..8765341 100644
--- a/frontend/src/lib/types.ts
+++ b/frontend/src/lib/types.ts
@@ -153,6 +153,7 @@ export interface OutcomeBucketStats {
 export interface PerformanceStats {
   overall: OutcomeBucketStats;
   pending: number;
+  maturing: number;
   by_direction: Record<string, OutcomeBucketStats>;
   by_action: Record<string, OutcomeBucketStats>;
   by_confidence: Record<string, OutcomeBucketStats>;
diff --git a/tests/unit/test_outcome_service.py b/tests/unit/test_outcome_service.py
index 4bdcef4..408dfb4 100644
--- a/tests/unit/test_outcome_service.py
+++ b/tests/unit/test_outcome_service.py
@@ -317,3 +317,26 @@ class TestGetPerformanceStats:
 
         stats = await get_performance_stats(db_session)
         assert stats["overall"]["total"] == 1
+
+    async def test_immature_setups_excluded_and_counted_as_maturing(self, db_session: AsyncSession):
+        ticker = await _make_ticker(db_session)
+        now = datetime.now(timezone.utc)
+        # Matured (detected well beyond the window) → counted in the stats.
+        db_session.add(_make_setup(
+            ticker, rr=2.0, actual_outcome=OUTCOME_TARGET_HIT, detected=now - timedelta(days=90),
+        ))
+        # Young but already resolved → excluded from stats, reported as maturing.
+        db_session.add(_make_setup(
+            ticker, rr=2.0, actual_outcome=OUTCOME_STOP_HIT, detected=now,
+        ))
+        # Young and still pending → also maturing.
+        db_session.add(_make_setup(ticker, detected=now))
+        await db_session.flush()
+
+        stats = await get_performance_stats(db_session)
+
+        assert stats["overall"]["total"] == 1   # only the matured win
+        assert stats["overall"]["wins"] == 1
+        assert stats["overall"]["hit_rate"] == 100.0
+        assert stats["pending"] == 1
+        assert stats["maturing"] == 2           # young resolved + pending