fix: only count matured setups in the live track record
The outcome stats were dominated by quick stop-outs: near stops resolve as losses within days while far targets take weeks, so a young sample (mostly pending, 0 expired) skewed sharply negative (e.g. 13.8% hit / -0.46R vs the backtest's 35.8% / +0.18R) — a maturation artifact, not a real result.

get_performance_stats now counts only setups whose full evaluation window has elapsed — 30 trading days, approximated as _MATURITY_DAYS = 45 calendar days — so winners have had as long to resolve as losers (unbiased, and comparable to the backtest). A new `maturing` count reports the younger setups held back: young setups that have already resolved plus everything still pending.

The Track Record UI relabels "Evaluated" -> "Matured", shows the maturing count, and explains the window in the empty state + methodology note.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -16,7 +16,7 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from datetime import date, datetime, timezone
|
||||
from datetime import date, datetime, timedelta, timezone
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
@@ -34,6 +34,13 @@ OUTCOME_EXPIRED = "expired"
|
||||
|
||||
DEFAULT_MAX_BARS = 30
|
||||
|
||||
# A setup's outcome is only unbiased once its full evaluation window has elapsed:
|
||||
# until then, near stops resolve as losses within days while far targets are still
|
||||
# pending, so a young sample skews sharply negative. Only count setups detected at
|
||||
# least this many CALENDAR days ago (~max_bars trading days, ×1.5 to cover
|
||||
# weekends/holidays). Younger setups are reported separately as "maturing".
|
||||
_MATURITY_DAYS = int(DEFAULT_MAX_BARS * 1.5)
|
||||
|
||||
# Confidence buckets for the performance breakdown
|
||||
_CONFIDENCE_BUCKETS = [
|
||||
("<50%", 0.0, 50.0),
|
||||
@@ -183,7 +190,12 @@ async def get_performance_stats(
|
||||
db: AsyncSession,
|
||||
config: dict | None = None,
|
||||
) -> dict:
|
||||
"""Aggregate outcome statistics over all evaluated trade setups.
|
||||
"""Aggregate outcome statistics over the *matured* evaluated trade setups.
|
||||
|
||||
Only setups whose full evaluation window has elapsed (see ``_MATURITY_DAYS``)
|
||||
are counted, so the headline isn't dominated by quick stop-outs while slower
|
||||
winners are still in flight. ``maturing`` reports how many are excluded for
|
||||
being too young.
|
||||
|
||||
avg_r is the expectancy per trade in R-multiples (win = +rr_ratio,
|
||||
loss = -1R, expired = 0R). A positive avg_r means the signals have
|
||||
@@ -197,13 +209,23 @@ async def get_performance_stats(
|
||||
result = await db.execute(
|
||||
select(TradeSetup).where(TradeSetup.actual_outcome.is_not(None))
|
||||
)
|
||||
evaluated = list(result.scalars().all())
|
||||
evaluated_all = list(result.scalars().all())
|
||||
|
||||
# Matured cohort only — see _MATURITY_DAYS. Setups whose window hasn't fully
|
||||
# elapsed are excluded so quick stop-outs can't drag the headline negative
|
||||
# while their slower-to-resolve winners are still in flight.
|
||||
cutoff_date = (datetime.now(timezone.utc) - timedelta(days=_MATURITY_DAYS)).date()
|
||||
evaluated = [s for s in evaluated_all if s.detected_at.date() <= cutoff_date]
|
||||
|
||||
pending_result = await db.execute(
|
||||
select(TradeSetup.id).where(TradeSetup.actual_outcome.is_(None))
|
||||
)
|
||||
pending_count = len(pending_result.scalars().all())
|
||||
|
||||
# Still inside their measurement window (excluded above so they can't bias the
|
||||
# stats): young setups that already resolved + everything still pending.
|
||||
maturing_count = (len(evaluated_all) - len(evaluated)) + pending_count
|
||||
|
||||
if config is not None:
|
||||
qualified = [s for s in evaluated if setup_qualifies(s, config)]
|
||||
else:
|
||||
@@ -229,6 +251,7 @@ async def get_performance_stats(
|
||||
return {
|
||||
"overall": _bucket_stats(qualified),
|
||||
"pending": pending_count,
|
||||
"maturing": maturing_count,
|
||||
"by_direction": {k: _bucket_stats(v) for k, v in sorted(by_direction.items())},
|
||||
"by_action": {k: _bucket_stats(v) for k, v in sorted(by_action.items())},
|
||||
"by_confidence": {
|
||||
|
||||
@@ -171,7 +171,10 @@ export function TrackRecordPanel() {
|
||||
neither level hit within 30 trading days <span className="text-gray-300">expire</span> at
|
||||
0R. Avg R is the expectancy per trade: wins earn their R:R ratio, losses cost −1R — a
|
||||
positive value means the signals have been profitable on a risk-adjusted basis. The
|
||||
evaluator runs nightly after OHLCV collection.
|
||||
evaluator runs nightly after OHLCV collection. Only setups whose full 30-day window has
|
||||
elapsed are counted — younger ones show as <span className="text-gray-300">maturing</span>,
|
||||
since near stops resolve in days while far targets need time, so early numbers would skew
|
||||
negative.
|
||||
</p>
|
||||
</Disclosure>
|
||||
<div className="flex shrink-0 items-center gap-2">
|
||||
@@ -198,10 +201,12 @@ export function TrackRecordPanel() {
|
||||
|
||||
{data && data.overall.total === 0 && (
|
||||
<Callout variant="empty">
|
||||
{qualifiedOnly
|
||||
? 'No evaluated setups meet the activation thresholds yet. Untick "Qualified signals only" to see all evaluated setups, or wait for more outcomes.'
|
||||
: 'No evaluated setups yet. Outcomes appear once setups are old enough for their stop or target to be hit — the evaluator runs nightly, or click Evaluate Now.'}
|
||||
{data.pending > 0 && ` ${data.pending} setup${data.pending === 1 ? '' : 's'} pending evaluation.`}
|
||||
{data.maturing > 0
|
||||
? `No setups have completed their ~30-day evaluation window yet — ${data.maturing} still maturing. ` +
|
||||
'Stats appear once a setup’s full window has elapsed; counting them earlier would skew toward quick stop-outs.'
|
||||
: qualifiedOnly
|
||||
? 'No matured setups meet the activation thresholds yet. Untick "Qualified signals only" to see all, or wait for more outcomes.'
|
||||
: 'No matured setups yet. Outcomes appear once setups complete their evaluation window — the evaluator runs nightly, or click Evaluate Now.'}
|
||||
</Callout>
|
||||
)}
|
||||
|
||||
@@ -226,9 +231,9 @@ export function TrackRecordPanel() {
|
||||
sub="cumulative risk-adjusted result"
|
||||
/>
|
||||
<StatCard
|
||||
label="Evaluated"
|
||||
label="Matured"
|
||||
value={String(data.overall.total)}
|
||||
sub={`${data.pending} pending · ${data.overall.expired} expired`}
|
||||
sub={`${data.maturing} maturing · ${data.overall.expired} expired`}
|
||||
/>
|
||||
</div>
|
||||
|
||||
|
||||
@@ -153,6 +153,7 @@ export interface OutcomeBucketStats {
|
||||
export interface PerformanceStats {
|
||||
overall: OutcomeBucketStats;
|
||||
pending: number;
|
||||
maturing: number;
|
||||
by_direction: Record<string, OutcomeBucketStats>;
|
||||
by_action: Record<string, OutcomeBucketStats>;
|
||||
by_confidence: Record<string, OutcomeBucketStats>;
|
||||
|
||||
@@ -317,3 +317,26 @@ class TestGetPerformanceStats:
|
||||
|
||||
stats = await get_performance_stats(db_session)
|
||||
assert stats["overall"]["total"] == 1
|
||||
|
||||
async def test_immature_setups_excluded_and_counted_as_maturing(self, db_session: AsyncSession):
|
||||
ticker = await _make_ticker(db_session)
|
||||
now = datetime.now(timezone.utc)
|
||||
# Matured (detected well beyond the window) → counted in the stats.
|
||||
db_session.add(_make_setup(
|
||||
ticker, rr=2.0, actual_outcome=OUTCOME_TARGET_HIT, detected=now - timedelta(days=90),
|
||||
))
|
||||
# Young but already resolved → excluded from stats, reported as maturing.
|
||||
db_session.add(_make_setup(
|
||||
ticker, rr=2.0, actual_outcome=OUTCOME_STOP_HIT, detected=now,
|
||||
))
|
||||
# Young and still pending → also maturing.
|
||||
db_session.add(_make_setup(ticker, detected=now))
|
||||
await db_session.flush()
|
||||
|
||||
stats = await get_performance_stats(db_session)
|
||||
|
||||
assert stats["overall"]["total"] == 1 # only the matured win
|
||||
assert stats["overall"]["wins"] == 1
|
||||
assert stats["overall"]["hit_rate"] == 100.0
|
||||
assert stats["pending"] == 1
|
||||
assert stats["maturing"] == 2 # young resolved + pending
|
||||
|
||||
Reference in New Issue
Block a user