fix: only count matured setups in the live track record
Deploy / lint (push) Successful in 8s
Deploy / test (push) Successful in 1m1s
Deploy / deploy (push) Successful in 36s

The outcome stats were dominated by quick stop-outs: near stops resolve as losses
within days while far targets take weeks, so a young sample (mostly pending,
0 expired) skewed sharply negative (e.g. 13.8% hit / -0.46R vs the backtest's
35.8% / +0.18R) — a maturation artifact, not a real result.

get_performance_stats now counts only setups whose full evaluation window has
elapsed (_MATURITY_DAYS: 45 calendar days, i.e. ~30 trading days), so winners have had
as long to resolve as losers (unbiased, and comparable to the backtest). A new `maturing` count reports the younger setups
held back. The Track Record UI relabels "Evaluated" -> "Matured", shows the
maturing count, and explains the window in the empty state + methodology note.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-28 13:41:48 +02:00
parent 8bcbbfcfd0
commit 7e9a6cd7ec
4 changed files with 62 additions and 10 deletions
+26 -3
View File
@@ -16,7 +16,7 @@ from __future__ import annotations
import logging import logging
from dataclasses import dataclass from dataclasses import dataclass
from datetime import date, datetime, timezone from datetime import date, datetime, timedelta, timezone
from sqlalchemy import select from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
@@ -34,6 +34,13 @@ OUTCOME_EXPIRED = "expired"
DEFAULT_MAX_BARS = 30 DEFAULT_MAX_BARS = 30
# A setup's outcome is only unbiased once its full evaluation window has elapsed:
# until then, near stops resolve as losses within days while far targets are still
# pending, so a young sample skews sharply negative. Only count setups detected at
# least this many CALENDAR days ago (~max_bars trading days, ×1.5 to cover
# weekends/holidays). Younger setups are reported separately as "maturing".
_MATURITY_DAYS = int(DEFAULT_MAX_BARS * 1.5)
# Confidence buckets for the performance breakdown # Confidence buckets for the performance breakdown
_CONFIDENCE_BUCKETS = [ _CONFIDENCE_BUCKETS = [
("<50%", 0.0, 50.0), ("<50%", 0.0, 50.0),
@@ -183,7 +190,12 @@ async def get_performance_stats(
db: AsyncSession, db: AsyncSession,
config: dict | None = None, config: dict | None = None,
) -> dict: ) -> dict:
"""Aggregate outcome statistics over all evaluated trade setups. """Aggregate outcome statistics over the *matured* evaluated trade setups.
Only setups whose full evaluation window has elapsed (see ``_MATURITY_DAYS``)
are counted, so the headline isn't dominated by quick stop-outs while slower
winners are still in flight. ``maturing`` reports how many are excluded for
being too young.
avg_r is the expectancy per trade in R-multiples (win = +rr_ratio, avg_r is the expectancy per trade in R-multiples (win = +rr_ratio,
loss = -1R, expired = 0R). A positive avg_r means the signals have loss = -1R, expired = 0R). A positive avg_r means the signals have
@@ -197,13 +209,23 @@ async def get_performance_stats(
result = await db.execute( result = await db.execute(
select(TradeSetup).where(TradeSetup.actual_outcome.is_not(None)) select(TradeSetup).where(TradeSetup.actual_outcome.is_not(None))
) )
evaluated = list(result.scalars().all()) evaluated_all = list(result.scalars().all())
# Matured cohort only — see _MATURITY_DAYS. Setups whose window hasn't fully
# elapsed are excluded so quick stop-outs can't drag the headline negative
# while their slower-to-resolve winners are still in flight.
cutoff_date = (datetime.now(timezone.utc) - timedelta(days=_MATURITY_DAYS)).date()
evaluated = [s for s in evaluated_all if s.detected_at.date() <= cutoff_date]
pending_result = await db.execute( pending_result = await db.execute(
select(TradeSetup.id).where(TradeSetup.actual_outcome.is_(None)) select(TradeSetup.id).where(TradeSetup.actual_outcome.is_(None))
) )
pending_count = len(pending_result.scalars().all()) pending_count = len(pending_result.scalars().all())
# Still inside their measurement window (excluded above so they can't bias the
# stats): young setups that already resolved + everything still pending.
maturing_count = (len(evaluated_all) - len(evaluated)) + pending_count
if config is not None: if config is not None:
qualified = [s for s in evaluated if setup_qualifies(s, config)] qualified = [s for s in evaluated if setup_qualifies(s, config)]
else: else:
@@ -229,6 +251,7 @@ async def get_performance_stats(
return { return {
"overall": _bucket_stats(qualified), "overall": _bucket_stats(qualified),
"pending": pending_count, "pending": pending_count,
"maturing": maturing_count,
"by_direction": {k: _bucket_stats(v) for k, v in sorted(by_direction.items())}, "by_direction": {k: _bucket_stats(v) for k, v in sorted(by_direction.items())},
"by_action": {k: _bucket_stats(v) for k, v in sorted(by_action.items())}, "by_action": {k: _bucket_stats(v) for k, v in sorted(by_action.items())},
"by_confidence": { "by_confidence": {
@@ -171,7 +171,10 @@ export function TrackRecordPanel() {
neither level hit within 30 trading days <span className="text-gray-300">expire</span> at neither level hit within 30 trading days <span className="text-gray-300">expire</span> at
0R. Avg R is the expectancy per trade: wins earn their R:R ratio, losses cost 1R — a 0R. Avg R is the expectancy per trade: wins earn their R:R ratio, losses cost 1R — a
positive value means the signals have been profitable on a risk-adjusted basis. The positive value means the signals have been profitable on a risk-adjusted basis. The
evaluator runs nightly after OHLCV collection. evaluator runs nightly after OHLCV collection. Only setups whose full 30-day window has
elapsed are counted — younger ones show as <span className="text-gray-300">maturing</span>,
since near stops resolve in days while far targets need time, so early numbers would skew
negative.
</p> </p>
</Disclosure> </Disclosure>
<div className="flex shrink-0 items-center gap-2"> <div className="flex shrink-0 items-center gap-2">
@@ -198,10 +201,12 @@ export function TrackRecordPanel() {
{data && data.overall.total === 0 && ( {data && data.overall.total === 0 && (
<Callout variant="empty"> <Callout variant="empty">
{qualifiedOnly {data.maturing > 0
? 'No evaluated setups meet the activation thresholds yet. Untick "Qualified signals only" to see all evaluated setups, or wait for more outcomes.' ? `No setups have completed their ~30-day evaluation window yet — ${data.maturing} still maturing. ` +
: 'No evaluated setups yet. Outcomes appear once setups are old enough for their stop or target to be hit — the evaluator runs nightly, or click Evaluate Now.'} 'Stats appear once a setup’s full window has elapsed; counting them earlier would skew toward quick stop-outs.'
{data.pending > 0 && ` ${data.pending} setup${data.pending === 1 ? '' : 's'} pending evaluation.`} : qualifiedOnly
? 'No matured setups meet the activation thresholds yet. Untick "Qualified signals only" to see all, or wait for more outcomes.'
: 'No matured setups yet. Outcomes appear once setups complete their evaluation window — the evaluator runs nightly, or click Evaluate Now.'}
</Callout> </Callout>
)} )}
@@ -226,9 +231,9 @@ export function TrackRecordPanel() {
sub="cumulative risk-adjusted result" sub="cumulative risk-adjusted result"
/> />
<StatCard <StatCard
label="Evaluated" label="Matured"
value={String(data.overall.total)} value={String(data.overall.total)}
sub={`${data.pending} pending · ${data.overall.expired} expired`} sub={`${data.maturing} maturing · ${data.overall.expired} expired`}
/> />
</div> </div>
+1
View File
@@ -153,6 +153,7 @@ export interface OutcomeBucketStats {
export interface PerformanceStats { export interface PerformanceStats {
overall: OutcomeBucketStats; overall: OutcomeBucketStats;
pending: number; pending: number;
maturing: number;
by_direction: Record<string, OutcomeBucketStats>; by_direction: Record<string, OutcomeBucketStats>;
by_action: Record<string, OutcomeBucketStats>; by_action: Record<string, OutcomeBucketStats>;
by_confidence: Record<string, OutcomeBucketStats>; by_confidence: Record<string, OutcomeBucketStats>;
+23
View File
@@ -317,3 +317,26 @@ class TestGetPerformanceStats:
stats = await get_performance_stats(db_session) stats = await get_performance_stats(db_session)
assert stats["overall"]["total"] == 1 assert stats["overall"]["total"] == 1
async def test_immature_setups_excluded_and_counted_as_maturing(self, db_session: AsyncSession):
    """Only matured setups feed the stats; younger ones are tallied as maturing."""
    ticker = await _make_ticker(db_session)
    detected_now = datetime.now(timezone.utc)
    detected_long_ago = detected_now - timedelta(days=90)

    # Detected 90 days back with a target hit -> expected to count in the stats.
    matured_win = _make_setup(
        ticker, rr=2.0, actual_outcome=OUTCOME_TARGET_HIT, detected=detected_long_ago,
    )
    # Already stopped out but detected just now -> expected to be held back as maturing.
    young_loss = _make_setup(
        ticker, rr=2.0, actual_outcome=OUTCOME_STOP_HIT, detected=detected_now,
    )
    # No outcome yet and detected just now -> pending, and maturing as well.
    young_pending = _make_setup(ticker, detected=detected_now)

    for setup in (matured_win, young_loss, young_pending):
        db_session.add(setup)
    await db_session.flush()

    stats = await get_performance_stats(db_session)
    overall = stats["overall"]

    # The headline figures reflect the matured win alone.
    assert overall["total"] == 1
    assert overall["wins"] == 1
    assert overall["hit_rate"] == 100.0
    assert stats["pending"] == 1
    # Young resolved setup + young pending setup.
    assert stats["maturing"] == 2