From 7e9a6cd7ece1ced5b737171419f0d305c3fccde7 Mon Sep 17 00:00:00 2001 From: Dennis Thiessen Date: Sun, 28 Jun 2026 13:41:48 +0200 Subject: [PATCH] fix: only count matured setups in the live track record MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The outcome stats were dominated by quick stop-outs: near stops resolve as losses within days while far targets take weeks, so a young sample (mostly pending, 0 expired) skewed sharply negative (e.g. 13.8% hit / -0.46R vs the backtest's 35.8% / +0.18R) — a maturation artifact, not a real result. get_performance_stats now counts only setups whose full evaluation window (~30 trading days, i.e. _MATURITY_DAYS = 45 calendar days) has elapsed, so winners had as long to resolve as losers (unbiased, and comparable to the backtest). A new `maturing` count reports the younger setups held back. The Track Record UI relabels "Evaluated" -> "Matured", shows the maturing count, and explains the window in the empty state + methodology note. Co-Authored-By: Claude Opus 4.8 --- app/services/outcome_service.py | 29 +++++++++++++++++-- .../components/signals/TrackRecordPanel.tsx | 19 +++++++----- frontend/src/lib/types.ts | 1 + tests/unit/test_outcome_service.py | 23 +++++++++++++++ 4 files changed, 62 insertions(+), 10 deletions(-) diff --git a/app/services/outcome_service.py b/app/services/outcome_service.py index 42460d0..5bb6a79 100644 --- a/app/services/outcome_service.py +++ b/app/services/outcome_service.py @@ -16,7 +16,7 @@ from __future__ import annotations import logging from dataclasses import dataclass -from datetime import date, datetime, timezone +from datetime import date, datetime, timedelta, timezone from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession @@ -34,6 +34,13 @@ OUTCOME_EXPIRED = "expired" DEFAULT_MAX_BARS = 30 +# A setup's outcome is only unbiased once its full evaluation window has elapsed: +# until then, near stops resolve as losses within days while far targets are still +# pending, so a young 
sample skews sharply negative. Only count setups detected at +# least this many CALENDAR days ago (~max_bars trading days, ×1.5 to cover +# weekends/holidays). Younger setups are reported separately as "maturing". +_MATURITY_DAYS = int(DEFAULT_MAX_BARS * 1.5) + # Confidence buckets for the performance breakdown _CONFIDENCE_BUCKETS = [ ("<50%", 0.0, 50.0), @@ -183,7 +190,12 @@ async def get_performance_stats( db: AsyncSession, config: dict | None = None, ) -> dict: - """Aggregate outcome statistics over all evaluated trade setups. + """Aggregate outcome statistics over the *matured* evaluated trade setups. + + Only setups whose full evaluation window has elapsed (see ``_MATURITY_DAYS``) + are counted, so the headline isn't dominated by quick stop-outs while slower + winners are still in flight. ``maturing`` reports how many are excluded for + being too young. avg_r is the expectancy per trade in R-multiples (win = +rr_ratio, loss = -1R, expired = 0R). A positive avg_r means the signals have @@ -197,13 +209,23 @@ async def get_performance_stats( result = await db.execute( select(TradeSetup).where(TradeSetup.actual_outcome.is_not(None)) ) - evaluated = list(result.scalars().all()) + evaluated_all = list(result.scalars().all()) + + # Matured cohort only — see _MATURITY_DAYS. Setups whose window hasn't fully + # elapsed are excluded so quick stop-outs can't drag the headline negative + # while their slower-to-resolve winners are still in flight. + cutoff_date = (datetime.now(timezone.utc) - timedelta(days=_MATURITY_DAYS)).date() + evaluated = [s for s in evaluated_all if s.detected_at.date() <= cutoff_date] pending_result = await db.execute( select(TradeSetup.id).where(TradeSetup.actual_outcome.is_(None)) ) pending_count = len(pending_result.scalars().all()) + # Still inside their measurement window (excluded above so they can't bias the + # stats): young setups that already resolved + everything still pending. 
+ maturing_count = (len(evaluated_all) - len(evaluated)) + pending_count + if config is not None: qualified = [s for s in evaluated if setup_qualifies(s, config)] else: @@ -229,6 +251,7 @@ async def get_performance_stats( return { "overall": _bucket_stats(qualified), "pending": pending_count, + "maturing": maturing_count, "by_direction": {k: _bucket_stats(v) for k, v in sorted(by_direction.items())}, "by_action": {k: _bucket_stats(v) for k, v in sorted(by_action.items())}, "by_confidence": { diff --git a/frontend/src/components/signals/TrackRecordPanel.tsx b/frontend/src/components/signals/TrackRecordPanel.tsx index a536cc8..719a8ec 100644 --- a/frontend/src/components/signals/TrackRecordPanel.tsx +++ b/frontend/src/components/signals/TrackRecordPanel.tsx @@ -171,7 +171,10 @@ export function TrackRecordPanel() { neither level hit within 30 trading days expire at 0R. Avg R is the expectancy per trade: wins earn their R:R ratio, losses cost −1R — a positive value means the signals have been profitable on a risk-adjusted basis. The - evaluator runs nightly after OHLCV collection. + evaluator runs nightly after OHLCV collection. Only setups whose full 30-day window has + elapsed are counted — younger ones show as maturing, + since near stops resolve in days while far targets need time, so early numbers would skew + negative.

@@ -198,10 +201,12 @@ export function TrackRecordPanel() { {data && data.overall.total === 0 && ( - {qualifiedOnly - ? 'No evaluated setups meet the activation thresholds yet. Untick "Qualified signals only" to see all evaluated setups, or wait for more outcomes.' - : 'No evaluated setups yet. Outcomes appear once setups are old enough for their stop or target to be hit — the evaluator runs nightly, or click Evaluate Now.'} - {data.pending > 0 && ` ${data.pending} setup${data.pending === 1 ? '' : 's'} pending evaluation.`} + {data.maturing > 0 + ? `No setups have completed their ~30-day evaluation window yet — ${data.maturing} still maturing. ` + + 'Stats appear once a setup’s full window has elapsed; counting them earlier would skew toward quick stop-outs.' + : qualifiedOnly + ? 'No matured setups meet the activation thresholds yet. Untick "Qualified signals only" to see all, or wait for more outcomes.' + : 'No matured setups yet. Outcomes appear once setups complete their evaluation window — the evaluator runs nightly, or click Evaluate Now.'} )} @@ -226,9 +231,9 @@ export function TrackRecordPanel() { sub="cumulative risk-adjusted result" />
diff --git a/frontend/src/lib/types.ts b/frontend/src/lib/types.ts index 2b18666..8765341 100644 --- a/frontend/src/lib/types.ts +++ b/frontend/src/lib/types.ts @@ -153,6 +153,7 @@ export interface OutcomeBucketStats { export interface PerformanceStats { overall: OutcomeBucketStats; pending: number; + maturing: number; by_direction: Record; by_action: Record; by_confidence: Record; diff --git a/tests/unit/test_outcome_service.py b/tests/unit/test_outcome_service.py index 4bdcef4..408dfb4 100644 --- a/tests/unit/test_outcome_service.py +++ b/tests/unit/test_outcome_service.py @@ -317,3 +317,26 @@ class TestGetPerformanceStats: stats = await get_performance_stats(db_session) assert stats["overall"]["total"] == 1 + + async def test_immature_setups_excluded_and_counted_as_maturing(self, db_session: AsyncSession): + ticker = await _make_ticker(db_session) + now = datetime.now(timezone.utc) + # Matured (detected well beyond the window) → counted in the stats. + db_session.add(_make_setup( + ticker, rr=2.0, actual_outcome=OUTCOME_TARGET_HIT, detected=now - timedelta(days=90), + )) + # Young but already resolved → excluded from stats, reported as maturing. + db_session.add(_make_setup( + ticker, rr=2.0, actual_outcome=OUTCOME_STOP_HIT, detected=now, + )) + # Young and still pending → also maturing. + db_session.add(_make_setup(ticker, detected=now)) + await db_session.flush() + + stats = await get_performance_stats(db_session) + + assert stats["overall"]["total"] == 1 # only the matured win + assert stats["overall"]["wins"] == 1 + assert stats["overall"]["hit_rate"] == 100.0 + assert stats["pending"] == 1 + assert stats["maturing"] == 2 # young resolved + pending