deepen OHLCV history + make the factor-IC pass honest about overlap/regime
Two changes so the cross-sectional signal results can actually be trusted.

(a) History depth — the binding constraint. Ingestion defaulted to 365 days, so long-lookback factors (12-month momentum, 52-week high) were only computable on a handful of weeks at the tail, and every IC reflected a single market regime.
- New `settings.ohlcv_history_days` (default 1825 ≈ 5y); new tickers backfill this far instead of 1 year.
- New manual "data_backfill" job (Admin → Jobs) re-fetches the full window for every ticker, ignoring incremental resume — run once to deepen existing 1-year histories. Idempotent (upsert); resumes after rate limits.

(b) Factor-IC honesty. The IC was averaged over weekly rebalances whose 30-day forward windows overlap, inflating the t-stat ~sqrt(6)x.
- IC now measured on NON-OVERLAPPING windows (weeks thinned to ~HORIZON apart).
- Each signal carries a `reliable` flag (>= 12 independent windows); BacktestPanel greys out and de-stars thin signals so a lucky 9-week IC of 0.3 can't masquerade as an edge.

332 backend tests pass; frontend build clean. No migration (config + job + an added JSON field on the cached backtest report).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
+43
-4
@@ -65,6 +65,7 @@ scheduler = AsyncIOScheduler(
|
||||
# Track last successful ticker per job for rate-limit resume
|
||||
_last_successful: dict[str, str | None] = {
|
||||
"data_collector": None,
|
||||
"data_backfill": None,
|
||||
"sentiment_collector": None,
|
||||
"fundamental_collector": None,
|
||||
}
|
||||
@@ -81,6 +82,17 @@ _job_runtime: dict[str, dict[str, object]] = {
|
||||
"finished_at": None,
|
||||
"message": None,
|
||||
},
|
||||
"data_backfill": {
|
||||
"running": False,
|
||||
"status": "idle",
|
||||
"processed": 0,
|
||||
"total": None,
|
||||
"progress_pct": None,
|
||||
"current_ticker": None,
|
||||
"started_at": None,
|
||||
"finished_at": None,
|
||||
"message": None,
|
||||
},
|
||||
"sentiment_collector": {
|
||||
"running": False,
|
||||
"status": "idle",
|
||||
@@ -392,16 +404,20 @@ def _chunked(symbols: list[str], chunk_size: int) -> list[list[str]]:
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def collect_ohlcv() -> None:
|
||||
async def collect_ohlcv(full_backfill: bool = False, job_name: str = "data_collector") -> None:
|
||||
"""Fetch latest daily OHLCV for all tracked tickers.
|
||||
|
||||
Uses AlpacaOHLCVProvider. Processes each ticker independently.
|
||||
On rate limit, records last successful ticker for resume.
|
||||
Start date is resolved by ingestion progress:
|
||||
- existing ticker: resume from last_ingested_date + 1
|
||||
- new ticker: backfill ~1 year by default
|
||||
- new ticker: backfill the configured history window
|
||||
|
||||
``full_backfill`` forces every ticker to re-fetch the full
|
||||
``settings.ohlcv_history_days`` window (ignoring incremental resume) — used by
|
||||
the manual data_backfill job to deepen shallow histories. ``job_name`` lets the
|
||||
backfill report its own runtime/resume state separate from data_collector.
|
||||
"""
|
||||
job_name = "data_collector"
|
||||
logger.info(json.dumps({"event": "job_start", "job": job_name}))
|
||||
_runtime_start(job_name)
|
||||
processed = 0
|
||||
@@ -437,13 +453,18 @@ async def collect_ohlcv() -> None:
|
||||
return
|
||||
|
||||
end_date = date.today()
|
||||
# Full backfill: pass an explicit start_date so fetch_and_ingest re-pulls
|
||||
# the whole window instead of resuming from the last stored bar.
|
||||
backfill_start = (
|
||||
end_date - timedelta(days=settings.ohlcv_history_days) if full_backfill else None
|
||||
)
|
||||
|
||||
for symbol in symbols:
|
||||
_runtime_progress(job_name, processed=processed, total=total, current_ticker=symbol)
|
||||
async with async_session_factory() as db:
|
||||
try:
|
||||
result = await ingestion_service.fetch_and_ingest(
|
||||
db, provider, symbol, start_date=None, end_date=end_date,
|
||||
db, provider, symbol, start_date=backfill_start, end_date=end_date,
|
||||
)
|
||||
_last_successful[job_name] = symbol
|
||||
processed += 1
|
||||
@@ -477,6 +498,17 @@ async def collect_ohlcv() -> None:
|
||||
_runtime_finish(job_name, "error", processed=processed, total=total, message=str(exc))
|
||||
|
||||
|
||||
async def backfill_ohlcv() -> None:
|
||||
"""Deep historical backfill: re-fetch the full ``settings.ohlcv_history_days``
|
||||
window for every ticker, ignoring incremental resume.
|
||||
|
||||
Manual/triggered job (Admin → Jobs). Run once to deepen the ~1-year histories
|
||||
so long-lookback factors (12-month momentum, 52-week high) and multi-regime
|
||||
backtests become computable. Idempotent (upsert); resumes after rate limits.
|
||||
"""
|
||||
await collect_ohlcv(full_backfill=True, job_name="data_backfill")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Job: Sentiment Collector
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -1227,6 +1259,13 @@ def configure_scheduler(schedule_config: dict[str, str] | None = None) -> None:
|
||||
run_backtest_job, "interval", hours=168,
|
||||
id="backtest", name="Backtest", replace_existing=True,
|
||||
)
|
||||
# Deep history backfill: manual only (never auto-fires); triggered from
|
||||
# Admin → Jobs when histories need deepening.
|
||||
scheduler.add_job(
|
||||
backfill_ohlcv, "interval", weeks=520,
|
||||
id="data_backfill", name="Data Backfill (deep history)",
|
||||
replace_existing=True, next_run_time=None,
|
||||
)
|
||||
|
||||
logger.info(
|
||||
json.dumps({
|
||||
|
||||
Reference in New Issue
Block a user