{
  "bench": "TONE Bench",
  "bench_cn": "声律榜",
  "round": "R1",
  "round_label": "首轮",
  "test_date": "2026-07-03",
  "publisher": "SVOIC 诗韵中心",
  "question_count": 80,
  "question_split": { "flawed": 40, "clean": 40 },
  "gold_standard": "确定性规则引擎（poemrank/gelv），与搜韵在 2496 首真实律绝上 0 分歧对齐",
  "metric": {
    "primary": "七类型二元判定 F1 的宏平均（综合F1）",
    "secondary": "平仄错字字级 F1（错字F1）",
    "note": "多音字设为不计分位，报之或漏之两头不罚；错字F1 不计入综合分。"
  },
  "types": ["平仄有误", "用韵", "孤平", "三平尾", "三仄尾", "失粘", "重字"],
  "small_sample_note": "孤平、三平尾、三仄尾、失粘 四列在本期题库中仅 1–2 例，为小样本，分项 F1 波动较大，仅供参考。",
  "valid_threshold": "有效题 = 模型返回可解析 JSON 判定的题数；有效题数少于全量（80 题）视为部分有效，本期仅 14/80 有效的模型判为成绩无效。",
  "results": [
    { "rank": 1, "slug": "claude-fable-5", "model": "Claude Fable 5", "vendor": "Anthropic", "openrouter_id": "anthropic/claude-fable-5", "macro_f1": 0.8536, "char_f1": 0.7843, "valid": 80, "total": 80, "status": "valid", "type_f1": { "平仄有误": 0.9091, "用韵": 0.963, "孤平": 1.0, "三平尾": 0.8, "三仄尾": 1.0, "失粘": 0.3333, "重字": 0.9697 } },
    { "rank": 2, "slug": "qwen3.7-max", "model": "Qwen3.7-Max", "vendor": "阿里 · 通义千问", "openrouter_id": "qwen/qwen3.7-max", "macro_f1": 0.8088, "char_f1": 0.5556, "valid": 80, "total": 80, "status": "valid", "type_f1": { "平仄有误": 0.8235, "用韵": 0.7879, "孤平": 1.0, "三平尾": 0.8, "三仄尾": 1.0, "失粘": 0.25, "重字": 1.0 } },
    { "rank": 3, "slug": "deepseek-v4-pro", "model": "DeepSeek V4 Pro", "vendor": "深度求索", "openrouter_id": "deepseek/deepseek-v4-pro", "macro_f1": 0.6788, "char_f1": 0.6222, "valid": 76, "total": 80, "status": "valid", "type_f1": { "平仄有误": 0.963, "用韵": 0.5217, "孤平": 0.0, "三平尾": 1.0, "三仄尾": 1.0, "失粘": 0.3333, "重字": 0.9333 } },
    { "rank": 4, "slug": "gpt-5.5", "model": "GPT-5.5", "vendor": "OpenAI", "openrouter_id": "openai/gpt-5.5", "macro_f1": 0.6343, "char_f1": 0.9048, "valid": 76, "total": 80, "status": "valid", "type_f1": { "平仄有误": 0.8276, "用韵": 0.8125, "孤平": 0.6667, "三平尾": 0.8, "三仄尾": 0.0, "失粘": 0.3333, "重字": 1.0 } },
    { "rank": 5, "slug": "glm-5.2", "model": "GLM-5.2", "vendor": "智谱 AI", "openrouter_id": "z-ai/glm-5.2", "macro_f1": 0.5699, "char_f1": 0.2, "valid": 67, "total": 80, "status": "valid", "type_f1": { "平仄有误": 0.7333, "用韵": 0.72, "孤平": 0.0, "三平尾": 0.6667, "三仄尾": 1.0, "失粘": 0.0, "重字": 0.8696 } },
    { "rank": 6, "slug": "gemini-3.1-pro", "model": "Gemini 3.1 Pro", "vendor": "Google", "openrouter_id": "google/gemini-3.1-pro-preview", "macro_f1": 0.5519, "char_f1": 0.3103, "valid": 66, "total": 80, "status": "valid", "type_f1": { "平仄有误": 0.6667, "用韵": 0.7333, "孤平": 1.0, "三平尾": 0.5, "三仄尾": 0.0, "失粘": 0.0, "重字": 0.963 } },
    { "rank": 7, "slug": "claude-opus-4.8", "model": "Claude Opus 4.8", "vendor": "Anthropic", "openrouter_id": "anthropic/claude-opus-4.8", "macro_f1": 0.4516, "char_f1": 0.1161, "valid": 80, "total": 80, "status": "valid", "type_f1": { "平仄有误": 0.3939, "用韵": 0.8462, "孤平": 0.0, "三平尾": 0.4444, "三仄尾": 0.6667, "失粘": 0.1, "重字": 0.7097 } },
    { "rank": 8, "slug": "grok-4.3", "model": "Grok-4.3", "vendor": "xAI", "openrouter_id": "x-ai/grok-4.3", "macro_f1": 0.3089, "char_f1": 0.0423, "valid": 80, "total": 80, "status": "valid", "type_f1": { "平仄有误": 0.4308, "用韵": 0.3478, "孤平": 0.0, "三平尾": 0.0, "三仄尾": 0.6667, "失粘": 0.0769, "重字": 0.64 } },
    { "rank": 9, "slug": "qwen3-30b-本机", "model": "Qwen3-30B（本机基线）", "vendor": "本地部署 · 非上榜对象", "openrouter_id": "LOCAL", "macro_f1": 0.1279, "char_f1": 0.016, "valid": 80, "total": 80, "status": "valid", "type_f1": { "平仄有误": 0.4783, "用韵": 0.2353, "孤平": 0.0, "三平尾": 0.1818, "三仄尾": 0.0, "失粘": 0.0, "重字": 0.0 } },
    { "rank": null, "slug": "kimi-k2.6", "model": "Kimi K2.6", "vendor": "月之暗面 Moonshot", "openrouter_id": "moonshotai/kimi-k2.6", "macro_f1": 0.7262, "char_f1": 0.4286, "valid": 14, "total": 80, "status": "invalid", "invalid_reason": "仅 14/80 题返回有效判定（调用超时），成绩无效，待补测。", "type_f1": { "平仄有误": 0.75, "用韵": 1.0, "孤平": 1.0, "三平尾": 0.6667, "三仄尾": 0.0, "失粘": 0.6667, "重字": 1.0 } }
  ]
}
