Add trade setup outcome tracking and performance stats

Closes the feedback loop on R:R scanner signals: - Nightly outcome_evaluator job replays unresolved setups against daily OHLCV bars: target_hit / stop_hit / ambiguous (same-bar, counted as loss) / expired after OUTCOME_EVALUATION_MAX_BARS (default 30) - Migration 004: evaluated_at + outcome_date on trade_setups - GET /trades/performance: hit rate, expectancy (avg R), total R with breakdowns by direction, recommended action, and confidence bucket - New Performance page (stat cards, breakdown tables, Evaluate Now, methodology disclosure) wired into sidebar and mobile nav - 17 new unit tests for evaluation logic and stats aggregation Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-10 19:23:57 +02:00
parent d69df5df27
commit 21ed83c56c
20 changed files with 859 additions and 5 deletions
@@ -45,6 +45,9 @@ class Settings(BaseSettings):
    default_watchlist_auto_size: int = 10
    default_rr_threshold: float = 1.5

+    # Outcome evaluation: trading days before an undecided setup expires
+    outcome_evaluation_max_bars: int = 30
+
    # Database Pool
    db_pool_size: int = 5
    db_pool_timeout: int = 30
@@ -1,8 +1,8 @@
-from datetime import datetime
+from datetime import date, datetime

 import json

-from sqlalchemy import DateTime, Float, ForeignKey, String, Text
+from sqlalchemy import Date, DateTime, Float, ForeignKey, String, Text
 from sqlalchemy.orm import Mapped, mapped_column, relationship

 from app.database import Base
@@ -32,6 +32,10 @@ class TradeSetup(Base):
    reasoning: Mapped[str | None] = mapped_column(Text, nullable=True)
    risk_level: Mapped[str | None] = mapped_column(String(10), nullable=True)
    actual_outcome: Mapped[str | None] = mapped_column(String(20), nullable=True)
+    evaluated_at: Mapped[datetime | None] = mapped_column(
+        DateTime(timezone=True), nullable=True
+    )
+    outcome_date: Mapped[date | None] = mapped_column(Date, nullable=True)

    ticker = relationship("Ticker", back_populates="trade_setups")

@@ -6,6 +6,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from app.dependencies import get_db, require_access
 from app.schemas.common import APIEnvelope
 from app.schemas.trade_setup import RecommendationSummaryResponse, TradeSetupResponse
+from app.services.outcome_service import get_performance_stats
 from app.services.rr_scanner_service import get_trade_setup_history, get_trade_setups

 router = APIRouter(tags=["trades"])
@@ -48,6 +49,20 @@ async def list_trade_setups(
    return APIEnvelope(status="success", data=data)


+@router.get("/trades/performance", response_model=APIEnvelope)
+async def get_trade_performance(
+    _user=Depends(require_access),
+    db: AsyncSession = Depends(get_db),
+) -> APIEnvelope:
+    """Aggregate outcome statistics over evaluated trade setups.
+
+    Outcomes are written by the nightly outcome_evaluator job (win = target
+    hit first, loss = stop hit first, expired = neither within the window).
+    """
+    stats = await get_performance_stats(db)
+    return APIEnvelope(status="success", data=stats)
+
+
@router.get("/trades/{symbol}", response_model=APIEnvelope)
 async def get_ticker_trade_setups(
    symbol: str,
@@ -34,6 +34,7 @@ from app.providers.fundamentals_chain import build_fundamental_provider_chain
 from app.providers.openai_sentiment import OpenAISentimentProvider
 from app.providers.protocol import SentimentData
 from app.services import fundamental_service, ingestion_service, sentiment_service
+from app.services.outcome_service import evaluate_pending_setups
 from app.services.rr_scanner_service import scan_all_tickers
 from app.services.ticker_universe_service import bootstrap_universe

@@ -676,6 +677,52 @@ async def scan_rr() -> None:
        _runtime_finish(job_name, "error", processed=processed, total=total, message=str(exc))


+# ---------------------------------------------------------------------------
+# Job: Outcome Evaluator
+# ---------------------------------------------------------------------------
+
+
+async def evaluate_outcomes() -> None:
+    """Evaluate unresolved trade setups against OHLCV data collected since.
+
+    Writes actual_outcome / outcome_date / evaluated_at on each decided setup.
+    Undecided setups stay pending and are re-checked on the next run.
+    """
+    job_name = "outcome_evaluator"
+    logger.info(json.dumps({"event": "job_start", "job": job_name}))
+    _runtime_start(job_name, total=1)
+
+    try:
+        async with async_session_factory() as db:
+            if not await _is_job_enabled(db, job_name):
+                logger.info(json.dumps({"event": "job_skipped", "job": job_name, "reason": "disabled"}))
+                _runtime_finish(job_name, "skipped", processed=0, total=1, message="Disabled")
+                return
+
+            summary = await evaluate_pending_setups(
+                db, max_bars=settings.outcome_evaluation_max_bars
+            )
+
+        _runtime_progress(job_name, processed=1, total=1)
+        _runtime_finish(
+            job_name, "completed", processed=1, total=1,
+            message=f"Evaluated {summary['evaluated']}, pending {summary['still_pending']}",
+        )
+        logger.info(json.dumps({
+            "event": "job_complete",
+            "job": job_name,
+            "summary": summary,
+        }))
+    except Exception as exc:
+        _runtime_finish(job_name, "error", processed=0, total=1, message=str(exc))
+        logger.error(json.dumps({
+            "event": "job_error",
+            "job": job_name,
+            "error_type": type(exc).__name__,
+            "message": str(exc),
+        }))
+
+
 # ---------------------------------------------------------------------------
 # Job: Ticker Universe Sync
 # ---------------------------------------------------------------------------
@@ -804,6 +851,16 @@ def configure_scheduler() -> None:
        replace_existing=True,
    )

+    # Outcome Evaluator — nightly, after fresh OHLCV has been collected
+    scheduler.add_job(
+        evaluate_outcomes,
+        "interval",
+        hours=24,
+        id="outcome_evaluator",
+        name="Outcome Evaluator",
+        replace_existing=True,
+    )
+
    logger.info(
        json.dumps({
            "event": "scheduler_configured",
@@ -2,7 +2,7 @@

 from __future__ import annotations

-from datetime import datetime
+from datetime import date, datetime

 from pydantic import BaseModel, Field

@@ -44,4 +44,6 @@ class TradeSetupResponse(BaseModel):
    reasoning: str | None = None
    risk_level: str | None = None
    actual_outcome: str | None = None
+    outcome_date: date | None = None
+    evaluated_at: datetime | None = None
    recommendation_summary: RecommendationSummaryResponse | None = None
@@ -400,6 +400,7 @@ VALID_JOB_NAMES = {
    "fundamental_collector",
    "rr_scanner",
    "ticker_universe_sync",
+    "outcome_evaluator",
 }

 JOB_LABELS = {
@@ -408,6 +409,7 @@ JOB_LABELS = {
    "fundamental_collector": "Fundamental Collector",
    "rr_scanner": "R:R Scanner",
    "ticker_universe_sync": "Ticker Universe Sync",
+    "outcome_evaluator": "Outcome Evaluator",
 }


@@ -0,0 +1,222 @@
+"""Trade setup outcome evaluation service.
+
+Closes the feedback loop on R:R scanner setups: walks daily OHLCV bars
+after detection and records whether the stop or the target was hit first.
+
+Outcome semantics (entry is the close at detection time, i.e. market entry):
+  - target_hit: target reached before the stop
+  - stop_hit:   stop reached before the target
+  - ambiguous:  stop AND target both within the same daily bar — with daily
+                granularity the order is unknowable, counted as a loss in stats
+  - expired:    neither level hit within ``max_bars`` trading days
+  - (NULL):     not enough bars yet to decide — re-evaluated on the next run
+"""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from datetime import date, datetime, timezone
+
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.models.ohlcv import OHLCVRecord
+from app.models.trade_setup import TradeSetup
+
+logger = logging.getLogger(__name__)
+
+OUTCOME_TARGET_HIT = "target_hit"
+OUTCOME_STOP_HIT = "stop_hit"
+OUTCOME_AMBIGUOUS = "ambiguous"
+OUTCOME_EXPIRED = "expired"
+
+DEFAULT_MAX_BARS = 30
+
+# Confidence buckets for the performance breakdown
+_CONFIDENCE_BUCKETS = [
+    ("<50%", 0.0, 50.0),
+    ("50-70%", 50.0, 70.0),
+    ("≥70%", 70.0, 100.01),
+]
+
+
+@dataclass(frozen=True)
+class Bar:
+    date: date
+    high: float
+    low: float
+
+
+def evaluate_setup_against_bars(
+    direction: str,
+    stop_loss: float,
+    target: float,
+    bars: list[Bar],
+    max_bars: int = DEFAULT_MAX_BARS,
+) -> tuple[str | None, date | None]:
+    """Determine a setup's outcome from daily bars strictly after detection.
+
+    Returns (outcome, outcome_date); (None, None) while still undecided.
+    """
+    for i, bar in enumerate(bars):
+        if i >= max_bars:
+            break
+        if direction == "long":
+            stop_hit = bar.low <= stop_loss
+            target_hit = bar.high >= target
+        else:
+            stop_hit = bar.high >= stop_loss
+            target_hit = bar.low <= target
+
+        if stop_hit and target_hit:
+            return OUTCOME_AMBIGUOUS, bar.date
+        if stop_hit:
+            return OUTCOME_STOP_HIT, bar.date
+        if target_hit:
+            return OUTCOME_TARGET_HIT, bar.date
+
+    if len(bars) >= max_bars:
+        return OUTCOME_EXPIRED, bars[max_bars - 1].date
+
+    return None, None
+
+
+async def evaluate_pending_setups(
+    db: AsyncSession,
+    max_bars: int = DEFAULT_MAX_BARS,
+) -> dict[str, int]:
+    """Evaluate all unevaluated trade setups against stored OHLCV data.
+
+    Bars are fetched once per ticker. Setups that cannot be decided yet
+    remain NULL and are picked up on the next run.
+    """
+    result = await db.execute(
+        select(TradeSetup).where(TradeSetup.actual_outcome.is_(None))
+    )
+    pending = list(result.scalars().all())
+
+    summary = {"evaluated": 0, "still_pending": 0, "by_outcome": {}}
+    if not pending:
+        return summary
+
+    by_ticker: dict[int, list[TradeSetup]] = {}
+    for setup in pending:
+        by_ticker.setdefault(setup.ticker_id, []).append(setup)
+
+    now = datetime.now(timezone.utc)
+
+    for ticker_id, setups in by_ticker.items():
+        earliest = min(s.detected_at for s in setups).date()
+        bars_result = await db.execute(
+            select(OHLCVRecord)
+            .where(
+                OHLCVRecord.ticker_id == ticker_id,
+                OHLCVRecord.date > earliest,
+            )
+            .order_by(OHLCVRecord.date.asc())
+        )
+        records = list(bars_result.scalars().all())
+        all_bars = [Bar(date=r.date, high=r.high, low=r.low) for r in records]
+
+        for setup in setups:
+            detected_date = setup.detected_at.date()
+            bars = [b for b in all_bars if b.date > detected_date]
+            outcome, outcome_date = evaluate_setup_against_bars(
+                setup.direction, setup.stop_loss, setup.target, bars, max_bars
+            )
+            if outcome is None:
+                summary["still_pending"] += 1
+                continue
+            setup.actual_outcome = outcome
+            setup.outcome_date = outcome_date
+            setup.evaluated_at = now
+            summary["evaluated"] += 1
+            summary["by_outcome"][outcome] = summary["by_outcome"].get(outcome, 0) + 1
+
+    await db.commit()
+    return summary
+
+
+def _realized_r(setup: TradeSetup) -> float | None:
+    """Realized result in R-multiples: win = +rr_ratio, loss = -1R, expired = 0R."""
+    if setup.actual_outcome == OUTCOME_TARGET_HIT:
+        return setup.rr_ratio
+    if setup.actual_outcome in (OUTCOME_STOP_HIT, OUTCOME_AMBIGUOUS):
+        return -1.0
+    if setup.actual_outcome == OUTCOME_EXPIRED:
+        return 0.0
+    return None
+
+
+def _bucket_stats(setups: list[TradeSetup]) -> dict:
+    wins = sum(1 for s in setups if s.actual_outcome == OUTCOME_TARGET_HIT)
+    losses = sum(
+        1 for s in setups if s.actual_outcome in (OUTCOME_STOP_HIT, OUTCOME_AMBIGUOUS)
+    )
+    expired = sum(1 for s in setups if s.actual_outcome == OUTCOME_EXPIRED)
+    decided = wins + losses
+    realized = [r for s in setups if (r := _realized_r(s)) is not None]
+
+    return {
+        "total": len(setups),
+        "wins": wins,
+        "losses": losses,
+        "expired": expired,
+        "hit_rate": round(wins / decided * 100, 1) if decided else None,
+        "avg_r": round(sum(realized) / len(realized), 3) if realized else None,
+        "total_r": round(sum(realized), 2) if realized else None,
+    }
+
+
+def _confidence_bucket(score: float | None) -> str | None:
+    if score is None:
+        return None
+    for label, lo, hi in _CONFIDENCE_BUCKETS:
+        if lo <= score < hi:
+            return label
+    return None
+
+
+async def get_performance_stats(db: AsyncSession) -> dict:
+    """Aggregate outcome statistics over all evaluated trade setups.
+
+    avg_r is the expectancy per trade in R-multiples (win = +rr_ratio,
+    loss = -1R, expired = 0R). A positive avg_r means the signals have
+    been profitable on a risk-adjusted basis.
+    """
+    result = await db.execute(
+        select(TradeSetup).where(TradeSetup.actual_outcome.is_not(None))
+    )
+    evaluated = list(result.scalars().all())
+
+    pending_result = await db.execute(
+        select(TradeSetup.id).where(TradeSetup.actual_outcome.is_(None))
+    )
+    pending_count = len(pending_result.scalars().all())
+
+    by_direction: dict[str, list[TradeSetup]] = {}
+    by_action: dict[str, list[TradeSetup]] = {}
+    by_confidence: dict[str, list[TradeSetup]] = {}
+
+    for setup in evaluated:
+        by_direction.setdefault(setup.direction, []).append(setup)
+        action = setup.recommended_action or "NONE"
+        by_action.setdefault(action, []).append(setup)
+        bucket = _confidence_bucket(setup.confidence_score)
+        if bucket is not None:
+            by_confidence.setdefault(bucket, []).append(setup)
+
+    bucket_order = [label for label, _, _ in _CONFIDENCE_BUCKETS]
+
+    return {
+        "overall": _bucket_stats(evaluated),
+        "pending": pending_count,
+        "by_direction": {k: _bucket_stats(v) for k, v in sorted(by_direction.items())},
+        "by_action": {k: _bucket_stats(v) for k, v in sorted(by_action.items())},
+        "by_confidence": {
+            label: _bucket_stats(by_confidence[label])
+            for label in bucket_order
+            if label in by_confidence
+        },
+    }
@@ -351,4 +351,6 @@ def _trade_setup_to_dict(setup: TradeSetup, symbol: str) -> dict:
        "reasoning": setup.reasoning,
        "risk_level": setup.risk_level,
        "actual_outcome": setup.actual_outcome,
+        "outcome_date": setup.outcome_date,
+        "evaluated_at": setup.evaluated_at,
    }