deepen OHLCV history + make the factor-IC pass honest about overlap/regime

Two changes so the cross-sectional signal results can actually be trusted. (a) History depth — the binding constraint. Ingestion defaulted to 365 days, so long-lookback factors (12-month momentum, 52-week high) were only computable on a handful of weeks at the tail, and every IC reflected a single market regime. - New `settings.ohlcv_history_days` (default 1825 ≈ 5y); new tickers backfill this far instead of 1 year. - New manual "data_backfill" job (Admin → Jobs) re-fetches the full window for every ticker, ignoring incremental resume — run once to deepen existing 1-year histories. Idempotent (upsert); resumes after rate limits. (b) Factor-IC honesty. The IC was averaged over weekly rebalances whose 30-day forward windows overlap, inflating the t-stat ~sqrt(6)x. - IC now measured on NON-OVERLAPPING windows (weeks thinned to ~HORIZON apart). - Each signal carries a `reliable` flag (>= 12 independent windows); BacktestPanel greys out and de-stars thin signals so a lucky 9-week IC of 0.3 can't masquerade as an edge. 332 backend tests pass; frontend build clean. No migration (config + job + an added JSON field on the cached backtest report). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-23 18:20:59 +02:00
parent 402025692a
commit 099846513b
9 changed files with 148 additions and 38 deletions
@@ -67,6 +67,12 @@ class Settings(BaseSettings):
    # Outcome evaluation: trading days before an undecided setup expires
    outcome_evaluation_max_bars: int = 30
    # OHLCV history depth to fetch. New tickers backfill this far; the manual
    # "data_backfill" job re-fetches the full window for everyone. ~5 years so
    # long-lookback factors (12-month momentum, 52-week high) and multi-regime
    # backtests become computable. Counted in calendar days (fed to
    # timedelta(days=...)); at ~252 trading days/year that is ~1260 daily bars.
    ohlcv_history_days: int = 1825
    # Database Pool
    db_pool_size: int = 5
    db_pool_timeout: int = 30
@@ -65,6 +65,7 @@ scheduler = AsyncIOScheduler(
 # Track last successful ticker per job for rate-limit resume
 _last_successful: dict[str, str | None] = {
    "data_collector": None,
    "data_backfill": None,
    "sentiment_collector": None,
    "fundamental_collector": None,
 }
@@ -81,6 +82,17 @@ _job_runtime: dict[str, dict[str, object]] = {
        "finished_at": None,
        "message": None,
    },
    "data_backfill": {
        "running": False,
        "status": "idle",
        "processed": 0,
        "total": None,
        "progress_pct": None,
        "current_ticker": None,
        "started_at": None,
        "finished_at": None,
        "message": None,
    },
    "sentiment_collector": {
        "running": False,
        "status": "idle",
@@ -392,16 +404,20 @@ def _chunked(symbols: list[str], chunk_size: int) -> list[list[str]]:
 # ---------------------------------------------------------------------------
-async def collect_ohlcv() -> None:
+async def collect_ohlcv(full_backfill: bool = False, job_name: str = "data_collector") -> None:
    """Fetch latest daily OHLCV for all tracked tickers.
    Uses AlpacaOHLCVProvider. Processes each ticker independently.
    On rate limit, records last successful ticker for resume.
        Start date is resolved by ingestion progress:
            - existing ticker: resume from last_ingested_date + 1
-            - new ticker: backfill ~1 year by default
+            - new ticker: backfill the configured history window
    ``full_backfill`` forces every ticker to re-fetch the full
    ``settings.ohlcv_history_days`` window (ignoring incremental resume) — used by
    the manual data_backfill job to deepen shallow histories. ``job_name`` lets the
    backfill report its own runtime/resume state separate from data_collector.
    """
    job_name = "data_collector"
    logger.info(json.dumps({"event": "job_start", "job": job_name}))
    _runtime_start(job_name)
    processed = 0
@@ -437,13 +453,18 @@ async def collect_ohlcv() -> None:
            return
        end_date = date.today()
        # Full backfill: pass an explicit start_date so fetch_and_ingest re-pulls
        # the whole window instead of resuming from the last stored bar.
        backfill_start = (
            end_date - timedelta(days=settings.ohlcv_history_days) if full_backfill else None
        )
        for symbol in symbols:
            _runtime_progress(job_name, processed=processed, total=total, current_ticker=symbol)
            async with async_session_factory() as db:
                try:
                    result = await ingestion_service.fetch_and_ingest(
-                        db, provider, symbol, start_date=None, end_date=end_date,
+                        db, provider, symbol, start_date=backfill_start, end_date=end_date,
                    )
                    _last_successful[job_name] = symbol
                    processed += 1
@@ -477,6 +498,17 @@ async def collect_ohlcv() -> None:
        _runtime_finish(job_name, "error", processed=processed, total=total, message=str(exc))
 async def backfill_ohlcv() -> None:
    """Run the OHLCV collector in full-backfill mode as the "data_backfill" job.

    Delegates to ``collect_ohlcv`` with ``full_backfill=True`` so every ticker
    re-fetches the whole ``settings.ohlcv_history_days`` window instead of
    resuming incrementally, and with ``job_name="data_backfill"`` so the run is
    tracked under its own job name.

    Intended as a manual job (Admin → Jobs) to deepen shallow ~1-year histories
    so long-lookback factors (12-month momentum, 52-week high) and multi-regime
    backtests become computable. Idempotent (upsert); resumes after rate limits.
    """
    await collect_ohlcv(job_name="data_backfill", full_backfill=True)
 # ---------------------------------------------------------------------------
 # Job: Sentiment Collector
 # ---------------------------------------------------------------------------
@@ -1227,6 +1259,13 @@ def configure_scheduler(schedule_config: dict[str, str] | None = None) -> None:
        run_backtest_job, "interval", hours=168,
        id="backtest", name="Backtest", replace_existing=True,
    )
    # Deep history backfill: manual only (never auto-fires); triggered from
    # Admin → Jobs when histories need deepening.
    scheduler.add_job(
        backfill_ohlcv, "interval", weeks=520,
        id="data_backfill", name="Data Backfill (deep history)",
        replace_existing=True, next_run_time=None,
    )
    logger.info(
        json.dumps({
@@ -538,6 +538,7 @@ async def get_pipeline_readiness(db: AsyncSession) -> list[dict]:
 VALID_JOB_NAMES = {
    "data_collector",
    "data_backfill",
    "sentiment_collector",
    "fundamental_collector",
    "rr_scanner",
@@ -552,6 +553,7 @@ VALID_JOB_NAMES = {
 JOB_LABELS = {
    "data_collector": "Data Collector (OHLCV)",
    "data_backfill": "Data Backfill (deep history)",
    "sentiment_collector": "Sentiment Collector",
    "fundamental_collector": "Fundamental Collector",
    "rr_scanner": "R:R Scanner",
@@ -79,7 +79,8 @@ _CAL_BUCKETS = [(0, 20), (20, 40), (40, 60), (60, 80), (80, 100.01)]
 # ranking stocks by this signal sort tomorrow's winners from losers. This is the
 # test the per-setup hit-rate report can't do: it measures predictive power of a
 # signal, not the outcome of a target/stop structure built on top of one.
-MIN_CROSS_SECTION = 20   # min tickers present in a week to score that week
+MIN_CROSS_SECTION = 20    # min tickers present in a week to score that week
 MIN_RELIABLE_PERIODS = 12 # min non-overlapping windows before a signal's IC is trusted
 def _wrap_levels(level_dicts: list[dict]) -> list[Any]:
@@ -407,26 +408,53 @@ def _quintile_spread(pairs: list[tuple[float, float]]) -> float | None:
    return sum(p[1] for p in top) / k - sum(p[1] for p in bottom) / k
 def _week_ordinal(week_key: tuple[int, int]) -> int:
    """Monotonic absolute week number from an (ISO year, ISO week) key."""
    year, week = week_key
    return year * 53 + week
 def _nonoverlapping_weeks(
    week_keys: list[tuple[int, int]], stride: int
 ) -> list[tuple[int, int]]:
    """Greedily pick week keys whose ordinals are at least ``stride`` apart.

    Keys are visited in ascending ``_week_ordinal`` order. The earliest is
    always kept; each later key is kept only when it lies ``stride`` or more
    ordinals after the most recently kept one. Spacing the rebalances this way
    stops their forward windows from overlapping, removing the autocorrelation
    that would otherwise inflate the IC t-stat across adjacent weekly rebalances.
    """
    # Pair each key with its ordinal once, then order by ordinal only (stable,
    # so keys sharing an ordinal keep their input order).
    ranked = sorted(
        ((_week_ordinal(key), key) for key in week_keys),
        key=lambda pair: pair[0],
    )
    selected: list[tuple[int, int]] = []
    anchor: int | None = None
    for ordinal, key in ranked:
        if anchor is not None and ordinal - anchor < stride:
            continue  # too close to the last kept week
        selected.append(key)
        anchor = ordinal
    return selected
 def _signal_evaluation(collected: dict) -> list[dict]:
    """Per-signal factor diagnostics, one row per candidate signal:
-      mean_ic               average weekly rank-IC (Spearman of signal vs fwd ret)
+      mean_ic               average rank-IC (Spearman of signal vs fwd ret)
      ic_t_stat             mean_ic / stderr — is the IC reliably non-zero?
-      ic_positive_pct       share of weeks the IC is positive (consistency)
+      ic_positive_pct       share of windows the IC is positive (consistency)
      mean_quintile_spread  avg top-minus-bottom-quintile forward return
      reliable              True once there are >= MIN_RELIABLE_PERIODS windows
-    A signal with no edge lands near IC 0 and spread 0. Caveat: weekly rebalances
+    IC is measured on NON-OVERLAPPING forward windows (weeks thinned to ~HORIZON
-    with a HORIZON-day forward window overlap, so the t-stat overstates
+    apart) so the t-stat isn't inflated by autocorrelation. A signal with no edge
-    significance — read it as directional, alongside ic_positive_pct.
+    lands near IC 0 / spread 0; one with too few independent windows is flagged
    unreliable rather than trusted on a lucky handful.
    """
    stride = max(1, round(HORIZON / 5))  # ISO weeks spanned by the forward window
    rows: list[dict] = []
    for name in sorted(collected):
        weeks_map = collected[name]
        usable = [wk for wk, recs in weeks_map.items() if len(recs) >= MIN_CROSS_SECTION]
        kept = _nonoverlapping_weeks(usable, stride)
        ics: list[float] = []
        spreads: list[float] = []
        sizes: list[int] = []
-        for recs in collected[name].values():
+        for wk in kept:
-            if len(recs) < MIN_CROSS_SECTION:
+            recs = weeks_map[wk]
                continue
            ic = _spearman([r[0] for r in recs], [r[1] for r in recs])
            if ic is not None:
                ics.append(ic)
@@ -450,6 +478,7 @@ def _signal_evaluation(collected: dict) -> list[dict]:
            "ic_t_stat": round(t_stat, 2) if t_stat is not None else None,
            "ic_positive_pct": round(sum(1 for x in ics if x > 0) / len(ics) * 100, 1),
            "mean_quintile_spread": round(sum(spreads) / len(spreads), 4) if spreads else None,
            "reliable": len(ics) >= MIN_RELIABLE_PERIODS,
        })
    rows.sort(key=lambda r: r["mean_ic"], reverse=True)
    return rows
@@ -518,12 +547,13 @@ async def run_backtest(
        "signal_eval": _signal_evaluation(collected),
        "signal_eval_note": (
            "Cross-sectional rank-IC of price-only signals vs the forward "
-            f"{HORIZON}-day return (weekly rebalance, min {MIN_CROSS_SECTION} "
+            f"{HORIZON}-day return (min {MIN_CROSS_SECTION} names/window). |IC| ≳ "
-            "names/week). |IC| ≳ 0.03 with a consistent sign is a real (if small) "
+            "0.03 with a consistent sign is a real (if small) edge; near 0 means "
-            "edge; near 0 means ranking on it sorts nothing. Momentum factors and "
+            "ranking on it sorts nothing. Momentum factors and high_52w are expected "
-            "high_52w are expected positive; reversal_1m and vol_6m are expected "
+            "positive; reversal_1m and vol_6m expected negative (mean-reversion / "
-            "negative (mean-reversion / low-vol anomaly). Overlapping windows inflate "
+            "low-vol anomaly). IC is measured on non-overlapping windows; signals "
-            "the t-stat — read directionally."
+            f"with fewer than {MIN_RELIABLE_PERIODS} independent windows are flagged "
            "unreliable (too few regimes — deepen history with the Data Backfill job)."
        ),
        "note": (
            "Sentiment & fundamentals held neutral (no point-in-time history). "
@@ -12,6 +12,7 @@ from datetime import date, timedelta
 from sqlalchemy import func, select
 from sqlalchemy.ext.asyncio import AsyncSession
 from app.config import settings
 from app.exceptions import NotFoundError, ProviderError, RateLimitError
 from app.models.ohlcv import OHLCVRecord
 from app.models.settings import IngestionProgress
@@ -92,20 +93,23 @@ async def fetch_and_ingest(
    if end_date is None:
        end_date = date.today()
-    # Resolve start_date: use progress resume or default to 1 year ago.
+    # Resolve start_date: use progress resume or backfill the configured history
-    # If we have too little history, force a one-year backfill even if
+    # window. If we have too little history, force a full backfill even if
-    # ingestion progress exists (upsert makes this safe and idempotent).
+    # ingestion progress exists (upsert makes this safe and idempotent). A caller
    # that passes an explicit start_date (e.g. the manual deep-backfill job)
    # bypasses this entirely.
    if start_date is None:
        progress = await _get_progress(db, ticker.id)
        bar_count = await _get_ohlcv_bar_count(db, ticker.id)
        minimum_backfill_bars = 200
        backfill_start = end_date - timedelta(days=settings.ohlcv_history_days)
        if bar_count < minimum_backfill_bars:
-            start_date = end_date - timedelta(days=365)
+            start_date = backfill_start
        elif progress is not None:
            start_date = progress.last_ingested_date + timedelta(days=1)
        else:
-            start_date = end_date - timedelta(days=365)
+            start_date = backfill_start
    # If start > end, nothing to fetch
    if start_date > end_date:
@@ -277,11 +277,12 @@ export function BacktestPanel() {
                </p>
                <p className="mb-2 text-[11px] text-gray-500">
                  Does ranking the universe by a signal predict the forward {report.params.horizon_days}-day
-                  return? Mean IC is the rank correlation between signal and return, averaged over weekly
+                  return? Mean IC is the rank correlation between signal and return, averaged over
-                  rebalances. <span className="text-emerald-400">|IC| ≳ {IC_EDGE_THRESHOLD}</span> with a
+                  non-overlapping windows. <span className="text-emerald-400">|IC| ≳ {IC_EDGE_THRESHOLD}</span> with a
                  consistent sign (high IC&gt;0 %) is a real, if small, edge; near 0 means it sorts nothing.
                  Momentum skips the last month; <em>reversal_1m is expected negative</em> if the universe
-                  mean-reverts. Q5−Q1 is the top-minus-bottom-quintile forward return.
+                  mean-reverts. Q5−Q1 is the top-minus-bottom-quintile forward return. <span className="text-gray-600">Greyed
                  rows have too few independent windows to trust — deepen history via the Data Backfill job.</span>
                </p>
                <div className="glass overflow-x-auto">
                  <table className="w-full text-sm">
@@ -298,9 +299,15 @@ export function BacktestPanel() {
                    </thead>
                    <tbody>
                      {report.signal_eval.map((row) => {
-                        const edge = Math.abs(row.mean_ic) >= IC_EDGE_THRESHOLD;
+                        // Only trust the edge highlight when the IC rests on enough
                        // independent windows; thin signals are dimmed, not starred.
                        const edge = row.reliable && Math.abs(row.mean_ic) >= IC_EDGE_THRESHOLD;
                        return (
-                          <tr key={row.signal} className={`border-b border-white/[0.04] ${edge ? 'bg-emerald-400/[0.06]' : ''}`}>
+                          <tr
                            key={row.signal}
                            className={`border-b border-white/[0.04] ${edge ? 'bg-emerald-400/[0.06]' : ''} ${row.reliable ? '' : 'opacity-40'}`}
                            title={row.reliable ? undefined : `Only ${row.weeks} independent window(s) — not enough to trust`}
                          >
                            <td className="px-4 py-2.5 font-medium text-gray-200">
                              {edge && <span className="mr-1 text-emerald-300">★</span>}
                              {SIGNAL_LABELS[row.signal] ?? row.signal}
@@ -232,6 +232,7 @@ export interface BacktestSignalEvalRow {
  ic_t_stat: number | null;
  ic_positive_pct: number;
  mean_quintile_spread: number | null;
  reliable: boolean;
 }
 export interface BacktestReport {
@@ -79,6 +79,7 @@ class TestConfigureScheduler:
        job_ids = {j.id for j in jobs}
        assert job_ids == {
            "data_collector",
            "data_backfill",
            "sentiment_collector",
            "fundamental_collector",
            "rr_scanner",
@@ -103,6 +104,7 @@ class TestConfigureScheduler:
            "daily_pipeline",
            "intraday_pipeline",
            "data_collector",
            "data_backfill",
            "fundamental_collector",
            "market_regime",
            "outcome_evaluator",
@@ -94,18 +94,17 @@ def _records(closes: list[float]) -> list[SimpleNamespace]:
 def test_signal_evaluation_separates_edge_from_noise():
    rng = random.Random(42)
-    # Build a synthetic cross-section directly: 30 weeks, 40 names each.
+    # 120 consecutive weeks, 40 names each. After non-overlapping thinning
-    # "edge" perfectly orders the forward return; "noise" is independent of it.
+    # (stride = HORIZON/5 = 6) that leaves 20 independent windows — above the
-    collected: dict = {
+    # reliability bar. "edge" perfectly orders the forward return; "noise" is
-        "edge": {},
+    # independent of it.
-        "noise": {},
+    collected: dict = {"edge": {}, "noise": {}}
-    }
+    for week in range(120):
    for week in range(30):
        edge_recs = []
        noise_recs = []
        for _ in range(40):
            fwd = rng.gauss(0, 0.05)
-            edge_recs.append((fwd, fwd))             # signal == fwd → IC = 1
+            edge_recs.append((fwd, fwd))               # signal == fwd → IC = 1
            noise_recs.append((rng.gauss(0, 1), fwd))  # signal ⟂ fwd → IC ≈ 0
        collected["edge"][(2020, week)] = edge_recs
        collected["noise"][(2020, week)] = noise_recs
@@ -113,13 +112,33 @@ def test_signal_evaluation_separates_edge_from_noise():
    rows = {r["signal"]: r for r in bt._signal_evaluation(collected)}
    assert rows["edge"]["mean_ic"] == 1.0
    assert rows["edge"]["weeks"] == 20             # 120 weeks thinned to non-overlapping
    assert rows["edge"]["reliable"] is True
    assert rows["edge"]["ic_positive_pct"] == 100.0
    assert rows["edge"]["mean_quintile_spread"] > 0
-    assert abs(rows["noise"]["mean_ic"]) < 0.15      # indistinguishable from zero
+    assert abs(rows["noise"]["mean_ic"]) < 0.15    # indistinguishable from zero
    # Rows are sorted by mean_ic descending: the real signal ranks first.
    assert bt._signal_evaluation(collected)[0]["signal"] == "edge"
 def test_signal_evaluation_flags_too_few_windows_unreliable():
    # Five adjacent weeks of 40 names each: thinning to non-overlapping windows
    # is expected to leave one window, which must be reported as unreliable.
    weeks_map: dict = {}
    for w in range(5):
        weeks_map[(2020, w)] = [(float(i), float(i)) for i in range(40)]
    collected: dict = {"edge": weeks_map}
    rows = bt._signal_evaluation(collected)
    first = rows[0]
    assert first["weeks"] == 1
    assert first["reliable"] is False
 def test_nonoverlapping_weeks_thins_by_stride():
    # Twelve consecutive ISO weeks of 2020 (weeks 1..12).
    consecutive = [(2020, n) for n in range(1, 13)]
    # With a stride of 6 only weeks 1 and 7 survive: 6 apart, no overlap.
    thinned = bt._nonoverlapping_weeks(consecutive, stride=6)
    assert thinned == [(2020, 1), (2020, 7)]
    # Stride 1 keeps every week, and the result is chronological even when the
    # input arrives in reverse order.
    backwards = list(reversed(consecutive))
    assert bt._nonoverlapping_weeks(backwards, stride=1) == consecutive
 def test_signal_evaluation_skips_thin_weeks():
    # A week with fewer than MIN_CROSS_SECTION names is ignored entirely.
    collected: dict = {"edge": {(2020, 1): [(1.0, 1.0)] * (bt.MIN_CROSS_SECTION - 1)}}