feat: collapse track record into a live-vs-backtest check
Deploy / lint (push) Successful in 7s
Deploy / test (push) Successful in 57s
Deploy / deploy (push) Successful in 34s

The outcome section measures the same thing as the backtest with the same code
and data — its only unique value is catching when the live system drifts from
the backtest (a bug, config/data drift, or look-ahead). So reframe it as exactly
that: a one-line "Live X R vs Backtest Y R · n matured · tracking ✓ / drift ⚠"
indicator (like-for-like with the qualified toggle), with the stat cards and
By-Action/By-Confidence tables moved into a collapsed "Outcome details"
disclosure. Drop the always-empty By-Direction table.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-28 13:58:15 +02:00
parent 7e9a6cd7ec
commit 6c2e45377c
@@ -3,6 +3,7 @@ import { useMutation, useQueryClient } from '@tanstack/react-query';
import { useActivation } from '../../hooks/useActivation';
import { activationSummary } from '../../lib/qualification';
import { usePerformance } from '../../hooks/usePerformance';
import { useBacktestReport } from '../../hooks/useMarketRegime';
import { triggerJob, resetTrackRecord } from '../../api/admin';
import { Button } from '../ui/Button';
import { Callout } from '../ui/Callout';
@@ -15,6 +16,14 @@ import { BacktestPanel } from './BacktestPanel';
import { MyTradesPanel } from './MyTradesPanel';
import type { OutcomeBucketStats } from '../../lib/types';
/**
 * Minimum number of matured setups required before a live-vs-backtest verdict
 * is given; with fewer, the live sample is too noisy to compare.
 */
const MIN_MATURED = 20;

/**
 * Drift tolerance, in R. Live expectancy that falls short of the backtest by
 * more than this is treated as drift rather than noise.
 */
const DRIFT_TOLERANCE_R = 0.2;

/** Possible outcomes of the live-vs-backtest comparison. */
type TrackingStatus =
  | 'building'
  | 'tracking'
  | 'drift'
  | 'no-backtest';
function fmtR(value: number | null): string {
if (value === null) return '—';
return `${value > 0 ? '+' : ''}${value.toFixed(2)}R`;
@@ -31,6 +40,17 @@ function rColor(value: number | null): string {
return 'text-gray-300';
}
/**
 * Pill-shaped badge that shows the live-vs-backtest verdict.
 *
 * Each status maps to a fixed label and a set of colour classes: emerald for
 * `tracking`, amber for `drift`, and a shared neutral grey for the two
 * "no verdict yet" states (`building` and `no-backtest`).
 */
function VerdictChip({ status }: { status: TrackingStatus }) {
  // Both "no verdict yet" states share the same muted appearance.
  const neutral = 'border-white/10 bg-white/[0.05] text-gray-400';
  const byStatus: Record<TrackingStatus, { cls: string; label: string }> = {
    building: { cls: neutral, label: 'building' },
    'no-backtest': { cls: neutral, label: 'no backtest' },
    tracking: { cls: 'border-emerald-500/30 bg-emerald-500/15 text-emerald-300', label: '✓ tracking' },
    drift: { cls: 'border-amber-500/30 bg-amber-500/15 text-amber-300', label: '⚠ drift' },
  };
  const { cls, label } = byStatus[status];
  const className = `shrink-0 rounded-full border px-2.5 py-1 text-xs font-medium ${cls}`;
  return <span className={className}>{label}</span>;
}
function StatCard({ label, value, valueClass = 'text-gray-100', sub }: {
label: string;
value: string;
@@ -57,7 +77,7 @@ function BreakdownTable({ rows, labelHeader, mapLabel }: {
}) {
const entries = Object.entries(rows);
if (entries.length === 0) {
return <Callout variant="empty">No evaluated setups in this breakdown yet.</Callout>;
return <Callout variant="empty">No matured setups in this breakdown yet.</Callout>;
}
return (
<div className="glass overflow-x-auto">
@@ -100,6 +120,7 @@ export function TrackRecordPanel() {
const { data, isLoading, isError, error } = usePerformance(
qualifiedOnly ? { qualified_only: true } : undefined,
);
const backtest = useBacktestReport();
const queryClient = useQueryClient();
const toast = useToast();
@@ -137,119 +158,145 @@ export function TrackRecordPanel() {
}
};
// Live (matured cohort) vs the backtest, like-for-like with the qualified toggle.
const live = data?.overall ?? null;
const btBucket = qualifiedOnly ? backtest.data?.overall_qualified : backtest.data?.overall_all;
const liveAvgR = live?.avg_r ?? null;
const liveN = live?.total ?? 0;
const btAvgR = btBucket?.avg_r ?? null;
let status: TrackingStatus = 'building';
if (liveAvgR != null && liveN >= MIN_MATURED) {
status = btAvgR == null ? 'no-backtest' : liveAvgR >= btAvgR - DRIFT_TOLERANCE_R ? 'tracking' : 'drift';
}
const verdictNote: Record<TrackingStatus, string> = {
building: `Not enough matured setups yet (need ~${MIN_MATURED}). Only setups whose full ~30-day window has elapsed are counted — the rest are still maturing. Until then, the backtest is your edge estimate; this becomes a live check as setups age past ~6 weeks.`,
'no-backtest': 'Run the backtest below to get a baseline to compare the live record against.',
tracking: 'Live setups are resolving in line with the backtest — the running system is faithfully implementing it (no look-ahead, config or data drift).',
drift: 'Live expectancy is running materially below the backtest. Could be small-sample noise, a regime shift, or a config/data/look-ahead gap between live and the backtest — worth a look.',
};
return (
<div className="space-y-6">
{/* Your real, realized results come first; the signal/theoretical record follows. */}
{/* Your real, realized results come first; the live-vs-backtest check follows. */}
<MyTradesPanel />
<div className="border-t border-white/[0.06]" />
<div className="glass-sm flex flex-wrap items-center justify-between gap-3 px-4 py-3">
<label className="flex cursor-pointer items-center gap-2.5 text-sm text-gray-300">
<input
type="checkbox"
checked={qualifiedOnly}
onChange={(e) => setQualifiedOnly(e.target.checked)}
className="h-4 w-4 cursor-pointer accent-blue-400"
/>
<span>
Qualified signals only
{activation.data && (
<span className="num ml-2 text-xs text-gray-500">{activationSummary(activation.data)}</span>
)}
</span>
</label>
<p className="text-xs text-gray-500">Confidence breakdown always covers all setups.</p>
</div>
<div className="flex items-start justify-between gap-4">
<Disclosure summary="How outcomes are measured">
<p className="text-xs text-gray-400">
Each setup is replayed against the daily bars after its detection: a{' '}
<span className="text-emerald-400">win</span> means the target was reached before the
stop, a <span className="text-red-400">loss</span> means the stop was hit first (bars
where both levels fall inside the same day count conservatively as losses). Setups with
neither level hit within 30 trading days <span className="text-gray-300">expire</span> at
0R. Avg R is the expectancy per trade: wins earn their R:R ratio, losses cost 1R a
positive value means the signals have been profitable on a risk-adjusted basis. The
evaluator runs nightly after OHLCV collection. Only setups whose full 30-day window has
elapsed are counted younger ones show as <span className="text-gray-300">maturing</span>,
since near stops resolve in days while far targets need time, so early numbers would skew
negative.
</p>
</Disclosure>
<div className="flex shrink-0 items-center gap-2">
<Button onClick={() => evaluateMutation.mutate()} loading={evaluateMutation.isPending}>
{evaluateMutation.isPending ? 'Evaluating…' : 'Evaluate Now'}
</Button>
<Button variant="danger" onClick={onReset} loading={resetMutation.isPending}>
{resetMutation.isPending ? 'Resetting…' : 'Reset'}
</Button>
</div>
</div>
{isLoading && (
<div className="grid gap-4 sm:grid-cols-2 lg:grid-cols-4">
<SkeletonCard /><SkeletonCard /><SkeletonCard /><SkeletonCard />
</div>
)}
{isError && (
<Callout variant="error">
{error instanceof Error ? error.message : 'Failed to load performance stats'}
</Callout>
)}
{data && data.overall.total === 0 && (
<Callout variant="empty">
{data.maturing > 0
? `No setups have completed their ~30-day evaluation window yet — ${data.maturing} still maturing. ` +
'Stats appear once a setups full window has elapsed; counting them earlier would skew toward quick stop-outs.'
: qualifiedOnly
? 'No matured setups meet the activation thresholds yet. Untick "Qualified signals only" to see all, or wait for more outcomes.'
: 'No matured setups yet. Outcomes appear once setups complete their evaluation window — the evaluator runs nightly, or click Evaluate Now.'}
</Callout>
)}
{data && data.overall.total > 0 && (
<>
<div className="grid gap-4 sm:grid-cols-2 lg:grid-cols-4">
<StatCard
label="Hit Rate"
value={fmtPct(data.overall.hit_rate)}
sub={`${data.overall.wins} wins / ${data.overall.losses} losses`}
/>
<StatCard
label="Expectancy"
value={fmtR(data.overall.avg_r)}
valueClass={rColor(data.overall.avg_r)}
sub="average R per trade"
/>
<StatCard
label="Total R"
value={fmtR(data.overall.total_r)}
valueClass={rColor(data.overall.total_r)}
sub="cumulative risk-adjusted result"
/>
<StatCard
label="Matured"
value={String(data.overall.total)}
sub={`${data.maturing} maturing · ${data.overall.expired} expired`}
/>
<Section title="Live vs Backtest" hint="is the live system tracking the backtest?">
{isError ? (
<Callout variant="error">
{error instanceof Error ? error.message : 'Failed to load performance stats'}
</Callout>
) : (
<div className="glass-sm space-y-2.5 p-4">
<div className="flex flex-wrap items-center justify-between gap-x-6 gap-y-2">
<div className="flex flex-wrap items-baseline gap-x-5 gap-y-1">
<span className="text-sm text-gray-400">
Live <span className={`num font-semibold ${rColor(liveAvgR)}`}>{fmtR(liveAvgR)}</span>
</span>
<span className="text-sm text-gray-400">
Backtest <span className={`num font-semibold ${rColor(btAvgR)}`}>{fmtR(btAvgR)}</span>
</span>
<span className="text-xs text-gray-500">
{liveN} matured{data ? ` · ${data.maturing} maturing` : ''} · {qualifiedOnly ? 'qualified' : 'all setups'}
</span>
</div>
<VerdictChip status={status} />
</div>
<p className="text-[11px] leading-relaxed text-gray-500">{verdictNote[status]}</p>
</div>
)}
</Section>
<Section title="By Direction">
<BreakdownTable rows={data.by_direction} labelHeader="Direction" />
</Section>
<Disclosure summary="Outcome details (matured cohort)">
<div className="space-y-4 pt-1">
<label className="flex w-fit cursor-pointer items-center gap-2.5 text-sm text-gray-300">
<input
type="checkbox"
checked={qualifiedOnly}
onChange={(e) => setQualifiedOnly(e.target.checked)}
className="h-4 w-4 cursor-pointer accent-blue-400"
/>
<span>
Qualified signals only
{activation.data && (
<span className="num ml-2 text-xs text-gray-500">{activationSummary(activation.data)}</span>
)}
</span>
</label>
<Section title="By Recommended Action">
<BreakdownTable rows={data.by_action} labelHeader="Action" mapLabel={actionLabel} />
</Section>
{isLoading && (
<div className="grid gap-4 sm:grid-cols-2 lg:grid-cols-4">
<SkeletonCard /><SkeletonCard /><SkeletonCard /><SkeletonCard />
</div>
)}
<Section title="By Confidence" hint="at detection time">
<BreakdownTable rows={data.by_confidence} labelHeader="Confidence" />
</Section>
</>
)}
{data && data.overall.total === 0 && (
<Callout variant="empty">
{data.maturing > 0
? `No setups have completed their ~30-day window yet — ${data.maturing} still maturing. ` +
'Counting them earlier would skew toward quick stop-outs.'
: 'No matured setups yet. Outcomes appear once setups complete their evaluation window — the evaluator runs nightly, or click Evaluate Now.'}
</Callout>
)}
{data && data.overall.total > 0 && (
<>
<div className="grid gap-4 sm:grid-cols-2 lg:grid-cols-4">
<StatCard
label="Hit Rate"
value={fmtPct(data.overall.hit_rate)}
sub={`${data.overall.wins} wins / ${data.overall.losses} losses`}
/>
<StatCard
label="Expectancy"
value={fmtR(data.overall.avg_r)}
valueClass={rColor(data.overall.avg_r)}
sub="average R per trade"
/>
<StatCard
label="Total R"
value={fmtR(data.overall.total_r)}
valueClass={rColor(data.overall.total_r)}
sub="cumulative risk-adjusted result"
/>
<StatCard
label="Matured"
value={String(data.overall.total)}
sub={`${data.maturing} maturing · ${data.overall.expired} expired`}
/>
</div>
<Section title="By Recommended Action">
<BreakdownTable rows={data.by_action} labelHeader="Action" mapLabel={actionLabel} />
</Section>
<Section title="By Confidence" hint="at detection time · all setups">
<BreakdownTable rows={data.by_confidence} labelHeader="Confidence" />
</Section>
</>
)}
<div className="flex flex-wrap items-center justify-between gap-3 border-t border-white/[0.06] pt-3">
<p className="max-w-2xl text-xs text-gray-500">
Each setup is replayed against the daily bars after detection: target before stop = win,
stop first = loss (both in one bar counts conservatively as a loss), neither within 30
trading days = expired at 0R. Only setups whose full window has elapsed are counted; younger
ones are still <span className="text-gray-300">maturing</span> (near stops resolve fast, far
targets need time, so early numbers would skew negative). The evaluator runs nightly.
</p>
<div className="flex shrink-0 items-center gap-2">
<Button onClick={() => evaluateMutation.mutate()} loading={evaluateMutation.isPending}>
{evaluateMutation.isPending ? 'Evaluating…' : 'Evaluate Now'}
</Button>
<Button variant="danger" onClick={onReset} loading={resetMutation.isPending}>
{resetMutation.isPending ? 'Resetting…' : 'Reset'}
</Button>
</div>
</div>
</div>
</Disclosure>
<div className="border-t border-white/[0.06] pt-2" />
<BacktestPanel />