{
  "schema_version": "v1",
  "snapshot_label": "MRCRv2 vs Reader Leaderboard — 5-row overlap delta",
  "phase": "96-T.3.3",
  "rendered_section_id": "external-cross-check",
  "source": {
    "name": "BenchLM",
    "url": "https://benchlm.ai/benchmarks/mrcrv2",
    "captured_at_utc": "2026-04-24T00:00:00Z",
    "status": "display_only",
    "status_note": "BenchLM is tracking MRCRv2 in its local dataset, but exact-source verification records for these rows are still being attached. Until exact-source attachments are completed they should not be treated as fully verified public benchmark rows. We will re-run the correlation when verified MRCRv2 numbers ship."
  },
  "config": {
    "reader_leaderboard_snapshot": "Warrant export, 27 Apr 2026 (K=10 baseline, locked anchor row)",
    "k_to_reader": 10,
    "frozen_retrieval_contract": "BGE-large-en-v1.5 \u222a QNDN v0 \u222a BM25 \u2192 RRF (k=60) \u2192 MixK rerank",
    "lme_s_questions": 500,
    "judge": "OpenAI gpt-4o, K=5 seeds {42,1,2,3,4}, 3-of-5 majority vote"
  },
  "rows": [
    {
      "model": "Gemma-4-26B-A4B-it",
      "track": "open-weights",
      "mrcrv2_pct": 44.1,
      "reader_best_pct": 71.4,
      "reader_best_pipeline": "Hybrid + F3-on-TR",
      "delta_pp": 27.3,
      "population": "positive_delta",
      "interpretation": "Mediocre at native long-context, excellent reader given clean evidence."
    },
    {
      "model": "GPT-OSS-120B @ high",
      "track": "open-weights",
      "mrcrv2_pct": 59.0,
      "reader_best_pct": 73.4,
      "reader_best_pipeline": "Open-Max (Stack)",
      "delta_pp": 14.4,
      "population": "positive_delta",
      "interpretation": "Same direction as Gemma; weaker magnitude."
    },
    {
      "model": "Claude Opus 4.6",
      "track": "closed-API",
      "mrcrv2_pct": 76.0,
      "reader_best_pct": 75.6,
      "reader_best_pipeline": "fixed-retrieval reference",
      "delta_pp": -0.4,
      "population": "convergence_point",
      "interpretation": "The only overlapping model where the two benchmarks numerically agree."
    },
    {
      "model": "GPT-5 mini",
      "track": "closed-API",
      "mrcrv2_pct": 79.0,
      "reader_best_pct": 59.0,
      "reader_best_pipeline": "fixed-retrieval reference",
      "delta_pp": -20.0,
      "population": "negative_delta",
      "interpretation": "Strong at native long-context; mediocre reader on fixed evidence."
    },
    {
      "model": "GPT-5.4",
      "track": "closed-API",
      "mrcrv2_pct": 97.0,
      "reader_best_pct": 71.2,
      "reader_best_pipeline": "fixed-retrieval reference",
      "delta_pp": -25.8,
      "population": "negative_delta",
      "interpretation": "Largest inverted gap. The cleanest single refutation of \"MRCRv2 ranking == Reader Leaderboard ranking\"."
    }
  ],
  "summary": {
    "best_row_pearson": 0.0,
    "stack_canonical_pearson": 0.33,
    "stack_canonical_spearman": 0.22,
    "open_weight_pearson": 0.5,
    "open_weight_n_models": 2,
    "two_population_structure": true,
    "negative_delta_models": ["GPT-5.4", "GPT-5 mini"],
    "positive_delta_models": ["Gemma-4-26B-A4B-it", "GPT-OSS-120B @ high"],
    "convergence_point": "Claude Opus 4.6",
    "interpretation": "Once you control for raw model size / strength, MRCRv2 and the Reader Leaderboard are measuring different bottlenecks. The weak correlation is the result, not a problem with the result."
  },
  "falsifiability_note": "Re-run the correlation when BenchLM ships verified MRCRv2 numbers. If the directional pattern survives, T.3.3 stands. If individual row magnitudes wobble such that the two-population structure breaks, journal a Phase 96-T.3.3.bis entry and re-frame."
}
