{
  "version": "v0",
  "built_at_utc": "2026-04-25T02:16:34Z",
  "benchmark": "LongMemEval-S (LME-S 500-Q)",
  "judge": "OpenAI gpt-4o (K=5 seeds {42,1,2,3,4}, 3-of-5 majority vote)",
  "frozen_retrieval_contract": {
    "first_stage": "BGE-large-en-v1.5 ∪ QNDN v0 ∪ BM25",
    "fusion": "Reciprocal Rank Fusion (RRF k=60)",
    "rerank": "MixK-trained cross-encoder",
    "delivered": "top-10 chunks to reader",
    "R@5": 0.962,
    "R@5_n": "481/500 questions",
    "reader_gap": "32.6 pp e2e upside available given retrieval contract"
  },
  "tracks": {
    "canonical": {
      "label": "Canonical",
      "desc": "The number Warrant stands behind publicly. Production-equivalent (oracle-qtype disclosed; learned classifier expected within 1–3 pp)."
    },
    "experimental": {
      "label": "Experimental",
      "desc": "Routed or prompt variants we have measured under the same protocol. Allowed on the board, but not the public headline."
    },
    "retrieval-only": {
      "label": "Retrieval-only",
      "desc": "Stack handed straight to the reader. No qtype routing, no full-context fallback. The contribution of retrieval-without-routing."
    },
    "no-retrieval": {
      "label": "No retrieval",
      "desc": "Reader receives the full LME-S 110 K-token haystack and zero retrieved chunks. Floor row; the contribution of retrieval is measured against it."
    },
    "open-swap": {
      "label": "Open swap",
      "desc": "Same canonical pipeline, different open-weights reader. Quantifies how much of the result is reader-specific."
    },
    "closed-reference": {
      "label": "Closed reference",
      "desc": "Closed-weights API reader on the identical retrieval contract. Reference comparison only — not a head-to-head; reproducibility is limited to whatever the API serves on the run date."
    }
  },
  "rows": [
    {
      "reader": "Gemma-4-26B-A4B-it",
      "model": "google/gemma-4-26B-A4B-it",
      "pipeline": "Hybrid + F3-on-TR  (oracle qtype + date-anchor on TR slice)",
      "params": "26B (A4B mixture-of-experts)",
      "notes": "Best routed variant we have measured. Adds a date-anchor pre-prompt on the temporal-reasoning slice only (F3). Allowed but not the public headline; +1.4 pp over Canonical, McNemar p ≈ 0.38 — directional, not yet decisive.",
      "track": "experimental",
      "n_answers": 500,
      "n_seeds": 5,
      "threshold": 3,
      "overall_n": 500,
      "overall_k": 357,
      "overall_acc": 0.714,
      "ci_lo": 0.6728760546154563,
      "ci_hi": 0.7518606086533169,
      "refusal_rate": 0.108,
      "per_qtype": {
        "single-session-user": {
          "k": 62,
          "n": 70,
          "acc": 0.8857142857142857
        },
        "multi-session": {
          "k": 92,
          "n": 133,
          "acc": 0.6917293233082706
        },
        "knowledge-update": {
          "k": 58,
          "n": 78,
          "acc": 0.7435897435897436
        },
        "single-session-preference": {
          "k": 8,
          "n": 30,
          "acc": 0.26666666666666666
        },
        "temporal-reasoning": {
          "k": 85,
          "n": 133,
          "acc": 0.6390977443609023
        },
        "single-session-assistant": {
          "k": 52,
          "n": 56,
          "acc": 0.9285714285714286
        }
      }
    },
    {
      "reader": "Gemma-4-26B-A4B-it",
      "model": "google/gemma-4-26B-A4B-it",
      "pipeline": "Hybrid  (oracle qtype: SSA→Naked, else→Stack)",
      "params": "26B (A4B MoE)",
      "notes": "The number Warrant publicly stands behind. Pure routing over the existing Stack and Naked answers; no extra inference. Production-equivalent uses a learned qtype classifier in place of oracle (pending; expected 1–3 pp gap).",
      "track": "canonical",
      "n_answers": 500,
      "n_seeds": 5,
      "threshold": 3,
      "overall_n": 500,
      "overall_k": 350,
      "overall_acc": 0.7,
      "ci_lo": 0.6584314090816449,
      "ci_hi": 0.7385187435059938,
      "refusal_rate": 0.1,
      "per_qtype": {
        "single-session-user": {
          "k": 62,
          "n": 70,
          "acc": 0.8857142857142857
        },
        "multi-session": {
          "k": 93,
          "n": 133,
          "acc": 0.6992481203007519
        },
        "knowledge-update": {
          "k": 57,
          "n": 78,
          "acc": 0.7307692307692307
        },
        "single-session-preference": {
          "k": 10,
          "n": 30,
          "acc": 0.3333333333333333
        },
        "temporal-reasoning": {
          "k": 75,
          "n": 133,
          "acc": 0.5639097744360902
        },
        "single-session-assistant": {
          "k": 53,
          "n": 56,
          "acc": 0.9464285714285714
        }
      }
    },
    {
      "reader": "Qwen3.6-27B",
      "model": "Qwen/Qwen3.6-27B",
      "pipeline": "Hybrid  (oracle qtype: SSA→Naked, else→Stack)",
      "params": "27B dense",
      "notes": "Same canonical pipeline, different open-weights reader. Phase 91.8 swap. The 3.8 pp gap to the Gemma Canonical is paid mostly on the SSA branch (Qwen's 110K reading is weaker).",
      "track": "open-swap",
      "n_answers": 500,
      "n_seeds": 5,
      "threshold": 3,
      "overall_n": 500,
      "overall_k": 331,
      "overall_acc": 0.662,
      "ci_lo": 0.6194419387868491,
      "ci_hi": 0.7020876848091384,
      "refusal_rate": 0.148,
      "per_qtype": {
        "single-session-user": {
          "k": 62,
          "n": 70,
          "acc": 0.8857142857142857
        },
        "multi-session": {
          "k": 86,
          "n": 133,
          "acc": 0.6466165413533834
        },
        "knowledge-update": {
          "k": 56,
          "n": 78,
          "acc": 0.717948717948718
        },
        "single-session-preference": {
          "k": 12,
          "n": 30,
          "acc": 0.4
        },
        "temporal-reasoning": {
          "k": 61,
          "n": 133,
          "acc": 0.45864661654135336
        },
        "single-session-assistant": {
          "k": 54,
          "n": 56,
          "acc": 0.9642857142857143
        }
      }
    },
    {
      "reader": "Gemma-4-26B-A4B-it",
      "model": "google/gemma-4-26B-A4B-it",
      "pipeline": "Stack  (BGE∪QNDN∪BM25 → RRF → top-10)",
      "params": "26B (A4B MoE)",
      "notes": "Stack only, no full-context fallback on the SSA slice. The +6.2 pp the Canonical Hybrid earns over this row is the routing contribution; the rest is the reader.",
      "track": "retrieval-only",
      "n_answers": 500,
      "n_seeds": 5,
      "threshold": 3,
      "overall_n": 500,
      "overall_k": 319,
      "overall_acc": 0.638,
      "ci_lo": 0.5949709219022378,
      "ci_hi": 0.6789246833832329,
      "refusal_rate": 0.144,
      "per_qtype": {
        "single-session-user": {
          "k": 62,
          "n": 70,
          "acc": 0.8857142857142857
        },
        "multi-session": {
          "k": 92,
          "n": 133,
          "acc": 0.6917293233082706
        },
        "single-session-preference": {
          "k": 10,
          "n": 30,
          "acc": 0.3333333333333333
        },
        "temporal-reasoning": {
          "k": 74,
          "n": 133,
          "acc": 0.556390977443609
        },
        "knowledge-update": {
          "k": 59,
          "n": 78,
          "acc": 0.7564102564102564
        },
        "single-session-assistant": {
          "k": 22,
          "n": 56,
          "acc": 0.39285714285714285
        }
      }
    },
    {
      "reader": "Qwen3.6-27B",
      "model": "Qwen/Qwen3.6-27B",
      "pipeline": "Stack  (same retrieval contract)",
      "params": "27B dense",
      "notes": "Reader swap, Stack-only. Higher refusal rate than Gemma at the same evidence — Qwen declines more readily when the top-10 chunks under-cover the question.",
      "track": "retrieval-only",
      "n_answers": 500,
      "n_seeds": 5,
      "threshold": 3,
      "overall_n": 500,
      "overall_k": 300,
      "overall_acc": 0.6,
      "ci_lo": 0.5564533144716549,
      "ci_hi": 0.6420217618221643,
      "refusal_rate": 0.212,
      "per_qtype": {
        "single-session-user": {
          "k": 63,
          "n": 70,
          "acc": 0.9
        },
        "multi-session": {
          "k": 85,
          "n": 133,
          "acc": 0.6390977443609023
        },
        "single-session-preference": {
          "k": 12,
          "n": 30,
          "acc": 0.4
        },
        "temporal-reasoning": {
          "k": 62,
          "n": 133,
          "acc": 0.46616541353383456
        },
        "knowledge-update": {
          "k": 57,
          "n": 78,
          "acc": 0.7307692307692307
        },
        "single-session-assistant": {
          "k": 21,
          "n": 56,
          "acc": 0.375
        }
      }
    },
    {
      "reader": "gpt-5-mini",
      "model": "openai/gpt-5-mini-2026-04-14",
      "pipeline": "Same retrieval contract, closed-weights reader",
      "params": "≈ proprietary",
      "notes": "Closed-weights API reader on the identical retrieval contract. Reference row for cross-class comparison; reproducibility is limited to whatever the API serves on the run date.",
      "track": "closed-reference",
      "n_answers": 500,
      "n_seeds": 5,
      "threshold": 3,
      "overall_n": 500,
      "overall_k": 295,
      "overall_acc": 0.59,
      "ci_lo": 0.5463618796646929,
      "ci_hi": 0.6322656889997446,
      "refusal_rate": 0.25,
      "per_qtype": {
        "single-session-user": {
          "k": 63,
          "n": 70,
          "acc": 0.9
        },
        "multi-session": {
          "k": 69,
          "n": 133,
          "acc": 0.518796992481203
        },
        "single-session-preference": {
          "k": 11,
          "n": 30,
          "acc": 0.36666666666666664
        },
        "temporal-reasoning": {
          "k": 79,
          "n": 133,
          "acc": 0.5939849624060151
        },
        "knowledge-update": {
          "k": 52,
          "n": 78,
          "acc": 0.6666666666666666
        },
        "single-session-assistant": {
          "k": 21,
          "n": 56,
          "acc": 0.375
        }
      }
    },
    {
      "reader": "Qwen3.6-27B",
      "model": "Qwen/Qwen3.6-27B",
      "pipeline": "Naked  (no retrieval)",
      "params": "27B dense",
      "notes": "No-retrieval floor for the open-swap track.",
      "track": "no-retrieval",
      "n_answers": 500,
      "n_seeds": 5,
      "threshold": 3,
      "overall_n": 500,
      "overall_k": 292,
      "overall_acc": 0.584,
      "ci_lo": 0.5403157691552084,
      "ci_hi": 0.6264032949315997,
      "refusal_rate": 0.388,
      "per_qtype": {
        "single-session-user": {
          "k": 64,
          "n": 70,
          "acc": 0.9142857142857143
        },
        "multi-session": {
          "k": 62,
          "n": 133,
          "acc": 0.46616541353383456
        },
        "single-session-preference": {
          "k": 5,
          "n": 30,
          "acc": 0.16666666666666666
        },
        "temporal-reasoning": {
          "k": 48,
          "n": 133,
          "acc": 0.3609022556390977
        },
        "knowledge-update": {
          "k": 59,
          "n": 78,
          "acc": 0.7564102564102564
        },
        "single-session-assistant": {
          "k": 54,
          "n": 56,
          "acc": 0.9642857142857143
        }
      }
    },
    {
      "reader": "Gemma-4-26B-A4B-it",
      "model": "google/gemma-4-26B-A4B-it",
      "pipeline": "CWFIX  (Stack + F1 refusal-retry + F2 chronological-KU + F3 date-anchor, all qtypes)",
      "params": "26B (A4B MoE)",
      "notes": "Cheap-win-fix bundle applied unconditionally. Regresses vs Canonical (F1 and F2 are harmful when ungated). Published here to keep the negative result auditable; F3 alone, scoped to TR, is the only keeper.",
      "track": "experimental",
      "n_answers": 500,
      "n_seeds": 5,
      "threshold": 3,
      "overall_n": 500,
      "overall_k": 283,
      "overall_acc": 0.566,
      "ci_lo": 0.522216352008653,
      "ci_hi": 0.6087771983452676,
      "refusal_rate": 0.204,
      "per_qtype": {
        "single-session-user": {
          "k": 61,
          "n": 70,
          "acc": 0.8714285714285714
        },
        "multi-session": {
          "k": 66,
          "n": 133,
          "acc": 0.49624060150375937
        },
        "single-session-preference": {
          "k": 6,
          "n": 30,
          "acc": 0.2
        },
        "temporal-reasoning": {
          "k": 83,
          "n": 133,
          "acc": 0.6240601503759399
        },
        "knowledge-update": {
          "k": 50,
          "n": 78,
          "acc": 0.6410256410256411
        },
        "single-session-assistant": {
          "k": 17,
          "n": 56,
          "acc": 0.30357142857142855
        }
      }
    },
    {
      "reader": "Gemma-4-26B-A4B-it",
      "model": "google/gemma-4-26B-A4B-it",
      "pipeline": "Naked  (no retrieval, LME-S 110 K-token haystack)",
      "params": "26B (A4B MoE)",
      "notes": "Floor row: reader receives the full LME-S haystack and no retrieval at all. The +10.6 pp Stack earns over this row is the retrieval contribution. Run on 8×A100 (≈140 GB); H200/H100 OOMs without quantisation.",
      "track": "no-retrieval",
      "n_answers": 500,
      "n_seeds": 5,
      "threshold": 3,
      "overall_n": 500,
      "overall_k": 266,
      "overall_acc": 0.532,
      "ci_lo": 0.48818530469160204,
      "ci_hi": 0.5753267197224201,
      "refusal_rate": 0.392,
      "per_qtype": {
        "single-session-user": {
          "k": 62,
          "n": 70,
          "acc": 0.8857142857142857
        },
        "multi-session": {
          "k": 51,
          "n": 133,
          "acc": 0.38345864661654133
        },
        "single-session-preference": {
          "k": 4,
          "n": 30,
          "acc": 0.13333333333333333
        },
        "temporal-reasoning": {
          "k": 45,
          "n": 133,
          "acc": 0.3383458646616541
        },
        "knowledge-update": {
          "k": 51,
          "n": 78,
          "acc": 0.6538461538461539
        },
        "single-session-assistant": {
          "k": 53,
          "n": 56,
          "acc": 0.9464285714285714
        }
      }
    }
  ],
  "headline_canonical": {
    "reader": "Gemma-4-26B-A4B-it",
    "model": "google/gemma-4-26B-A4B-it",
    "pipeline": "Hybrid  (oracle qtype: SSA→Naked, else→Stack)",
    "params": "26B (A4B MoE)",
    "notes": "The number Warrant publicly stands behind. Pure routing over the existing Stack and Naked answers; no extra inference. Production-equivalent uses a learned qtype classifier in place of oracle (pending; expected 1–3 pp gap).",
    "track": "canonical",
    "n_answers": 500,
    "n_seeds": 5,
    "threshold": 3,
    "overall_n": 500,
    "overall_k": 350,
    "overall_acc": 0.7,
    "ci_lo": 0.6584314090816449,
    "ci_hi": 0.7385187435059938,
    "refusal_rate": 0.1,
    "per_qtype": {
      "single-session-user": {
        "k": 62,
        "n": 70,
        "acc": 0.8857142857142857
      },
      "multi-session": {
        "k": 93,
        "n": 133,
        "acc": 0.6992481203007519
      },
      "knowledge-update": {
        "k": 57,
        "n": 78,
        "acc": 0.7307692307692307
      },
      "single-session-preference": {
        "k": 10,
        "n": 30,
        "acc": 0.3333333333333333
      },
      "temporal-reasoning": {
        "k": 75,
        "n": 133,
        "acc": 0.5639097744360902
      },
      "single-session-assistant": {
        "k": 53,
        "n": 56,
        "acc": 0.9464285714285714
      }
    }
  }
}