{
  "schema_version": "1.0",
  "service": "codex-reset-radar",
  "type": "model_iq_check",
  "subset": {
    "id": "deepswe-13-v1",
    "name": "DeepSWE 13-task probe v1",
    "size": 13,
    "selection_policy": "Fixed low-cost mixed pass/fail subset calibrated from the 2026-05-30 113-task Codex GPT-5.5 xhigh run.",
    "tasks": [
      {
        "task_name": "ytt-jsonpath-query-api",
        "language": "go",
        "baseline_passed": true,
        "baseline_cost_usd": 2.62,
        "role": "stable_pass",
        "title": "Add JSONPath query APIs to orderedmap and Starlark modules"
      },
      {
        "task_name": "participle-grammar-conflict-analysis",
        "language": "go",
        "baseline_passed": false,
        "baseline_cost_usd": 3.16,
        "role": "boundary_fail",
        "title": "Add build-time grammar conflict analysis to participle"
      },
      {
        "task_name": "abs-module-cache-flags",
        "language": "go",
        "baseline_passed": true,
        "baseline_cost_usd": 4.67,
        "role": "stable_pass",
        "title": "Harden module loading, cache introspection, and script flags"
      },
      {
        "task_name": "httpx-multipart-response-parsing",
        "language": "python",
        "baseline_passed": true,
        "baseline_cost_usd": 3.02,
        "role": "stable_pass",
        "title": "Add multipart response parsing to HTTPX"
      },
      {
        "task_name": "bandit-incremental-cache-control",
        "language": "python",
        "baseline_passed": false,
        "baseline_cost_usd": 4.12,
        "role": "boundary_fail",
        "title": "Add incremental cache controls to Bandit"
      },
      {
        "task_name": "ipython-session-bundle-replay",
        "language": "python",
        "baseline_passed": true,
        "baseline_cost_usd": 4.58,
        "role": "stable_pass",
        "title": "Add session bundle recording and replay to IPython"
      },
      {
        "task_name": "ofetch-per-origin-circuit-breaker",
        "language": "typescript",
        "baseline_passed": true,
        "baseline_cost_usd": 2.78,
        "role": "stable_pass",
        "title": "Add a per-origin circuit breaker to ofetch"
      },
      {
        "task_name": "obsidian-linter-link-format-conversion",
        "language": "typescript",
        "baseline_passed": false,
        "baseline_cost_usd": 2.79,
        "role": "boundary_fail",
        "title": "Add link format conversion between wiki and markdown syntax"
      },
      {
        "task_name": "kea-atomic-signal-selectors",
        "language": "typescript",
        "baseline_passed": true,
        "baseline_cost_usd": 5.59,
        "role": "stable_pass",
        "title": "Add atomic signal selectors to Kea"
      },
      {
        "task_name": "ink-grid-box-layout",
        "language": "typescript",
        "baseline_passed": true,
        "baseline_cost_usd": 5.6,
        "role": "stable_pass",
        "title": "Add CSS Grid layout to the Box component"
      },
      {
        "task_name": "csstree-shorthand-expansion-compression",
        "language": "javascript",
        "baseline_passed": true,
        "baseline_cost_usd": 3.82,
        "role": "stable_pass",
        "title": "Add shorthand expansion and compression to the lexer"
      },
      {
        "task_name": "fd-deterministic-multi-key-sorting",
        "language": "rust",
        "baseline_passed": false,
        "baseline_cost_usd": 4.26,
        "role": "boundary_fail",
        "title": "Add deterministic multi-key sorting to fd"
      },
      {
        "task_name": "oxvg-structural-selector-preservation",
        "language": "rust",
        "baseline_passed": true,
        "baseline_cost_usd": 8.49,
        "role": "hard_pass",
        "title": "Preserve structure needed by stylesheet selectors"
      }
    ]
  },
  "baseline": {
    "full_run": {
      "tasks": 113,
      "valid_tasks": 113,
      "passed": 75,
      "failed": 38,
      "pass_rate": 0.663717,
      "cost_usd": 767.855315,
      "n_input_tokens": 1039855369,
      "n_cache_tokens": 1014737920,
      "n_output_tokens": 4496637,
      "n_agent_steps": 9160,
      "wall_seconds": 46195
    },
    "subset": {
      "date": "2026-05-30",
      "model": "gpt-5.5",
      "reasoning_effort": "xhigh",
      "tasks": 13,
      "passed": 9,
      "failed": 4,
      "pass_rate": 0.692308,
      "iq_score": 100.0,
      "estimated_cost_usd": 55.5
    }
  },
  "history": [
    {
      "date": "2026-05-30",
      "label": "GPT-5.5 xhigh full-run calibration filtered to DeepSWE 13",
      "source": "combined-113-codex-gpt55-xhigh-20260530",
      "model": "gpt-5.5",
      "reasoning_effort": "xhigh",
      "subset_id": "deepswe-13-v1",
      "tasks": 13,
      "valid_tasks": 13,
      "passed": 9,
      "failed": 4,
      "invalid": 0,
      "pass_rate": 0.692308,
      "baseline_pass_rate": 0.692308,
      "iq_score": 100.0,
      "status": "green",
      "cost_usd": 55.511044,
      "n_input_tokens": 63272018,
      "n_cache_tokens": 60988288,
      "n_output_tokens": 453275,
      "n_agent_steps": 725,
      "wall_seconds": 4073,
      "wall_time_basis": "scheduled_task_durations",
      "completion_concurrency": 4,
      "serial_task_seconds": 12593,
      "source_span_seconds": 39462,
      "task_results": [
        {
          "task_name": "ytt-jsonpath-query-api",
          "language": "go",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1,
          "exception_type": null,
          "cost_usd": 2.623482,
          "n_agent_steps": 31,
          "duration_seconds": 764
        },
        {
          "task_name": "participle-grammar-conflict-analysis",
          "language": "go",
          "role": "boundary_fail",
          "baseline_passed": false,
          "passed": false,
          "valid": true,
          "reward": 0,
          "exception_type": null,
          "cost_usd": 3.16346,
          "n_agent_steps": 47,
          "duration_seconds": 1074
        },
        {
          "task_name": "abs-module-cache-flags",
          "language": "go",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1,
          "exception_type": null,
          "cost_usd": 4.673595000000001,
          "n_agent_steps": 57,
          "duration_seconds": 1015
        },
        {
          "task_name": "httpx-multipart-response-parsing",
          "language": "python",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1,
          "exception_type": null,
          "cost_usd": 3.0185630000000003,
          "n_agent_steps": 39,
          "duration_seconds": 586
        },
        {
          "task_name": "bandit-incremental-cache-control",
          "language": "python",
          "role": "boundary_fail",
          "baseline_passed": false,
          "passed": false,
          "valid": true,
          "reward": 0,
          "exception_type": null,
          "cost_usd": 4.117954,
          "n_agent_steps": 52,
          "duration_seconds": 920
        },
        {
          "task_name": "ipython-session-bundle-replay",
          "language": "python",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1,
          "exception_type": null,
          "cost_usd": 4.584818,
          "n_agent_steps": 45,
          "duration_seconds": 1014
        },
        {
          "task_name": "ofetch-per-origin-circuit-breaker",
          "language": "typescript",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1,
          "exception_type": null,
          "cost_usd": 2.7796270000000005,
          "n_agent_steps": 40,
          "duration_seconds": 837
        },
        {
          "task_name": "obsidian-linter-link-format-conversion",
          "language": "typescript",
          "role": "boundary_fail",
          "baseline_passed": false,
          "passed": false,
          "valid": true,
          "reward": 0,
          "exception_type": null,
          "cost_usd": 2.7941079999999996,
          "n_agent_steps": 34,
          "duration_seconds": 672
        },
        {
          "task_name": "kea-atomic-signal-selectors",
          "language": "typescript",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1,
          "exception_type": null,
          "cost_usd": 5.586837,
          "n_agent_steps": 77,
          "duration_seconds": 1149
        },
        {
          "task_name": "ink-grid-box-layout",
          "language": "typescript",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1,
          "exception_type": null,
          "cost_usd": 5.59914,
          "n_agent_steps": 91,
          "duration_seconds": 1028
        },
        {
          "task_name": "csstree-shorthand-expansion-compression",
          "language": "javascript",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1,
          "exception_type": null,
          "cost_usd": 3.818159,
          "n_agent_steps": 50,
          "duration_seconds": 1212
        },
        {
          "task_name": "fd-deterministic-multi-key-sorting",
          "language": "rust",
          "role": "boundary_fail",
          "baseline_passed": false,
          "passed": false,
          "valid": true,
          "reward": 0,
          "exception_type": null,
          "cost_usd": 4.2603670000000005,
          "n_agent_steps": 58,
          "duration_seconds": 905
        },
        {
          "task_name": "oxvg-structural-selector-preservation",
          "language": "rust",
          "role": "hard_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1,
          "exception_type": null,
          "cost_usd": 8.490934,
          "n_agent_steps": 104,
          "duration_seconds": 1419
        }
      ]
    }
  ],
  "latest": {
    "date": "2026-05-30",
    "label": "GPT-5.5 xhigh full-run calibration filtered to DeepSWE 13",
    "source": "combined-113-codex-gpt55-xhigh-20260530",
    "model": "gpt-5.5",
    "reasoning_effort": "xhigh",
    "subset_id": "deepswe-13-v1",
    "tasks": 13,
    "valid_tasks": 13,
    "passed": 9,
    "failed": 4,
    "invalid": 0,
    "pass_rate": 0.692308,
    "baseline_pass_rate": 0.692308,
    "iq_score": 100.0,
    "status": "green",
    "cost_usd": 55.511044,
    "n_input_tokens": 63272018,
    "n_cache_tokens": 60988288,
    "n_output_tokens": 453275,
    "n_agent_steps": 725,
    "wall_seconds": 4073,
    "wall_time_basis": "scheduled_task_durations",
    "completion_concurrency": 4,
    "serial_task_seconds": 12593,
    "source_span_seconds": 39462,
    "task_results": [
      {
        "task_name": "ytt-jsonpath-query-api",
        "language": "go",
        "role": "stable_pass",
        "baseline_passed": true,
        "passed": true,
        "valid": true,
        "reward": 1,
        "exception_type": null,
        "cost_usd": 2.623482,
        "n_agent_steps": 31,
        "duration_seconds": 764
      },
      {
        "task_name": "participle-grammar-conflict-analysis",
        "language": "go",
        "role": "boundary_fail",
        "baseline_passed": false,
        "passed": false,
        "valid": true,
        "reward": 0,
        "exception_type": null,
        "cost_usd": 3.16346,
        "n_agent_steps": 47,
        "duration_seconds": 1074
      },
      {
        "task_name": "abs-module-cache-flags",
        "language": "go",
        "role": "stable_pass",
        "baseline_passed": true,
        "passed": true,
        "valid": true,
        "reward": 1,
        "exception_type": null,
        "cost_usd": 4.673595000000001,
        "n_agent_steps": 57,
        "duration_seconds": 1015
      },
      {
        "task_name": "httpx-multipart-response-parsing",
        "language": "python",
        "role": "stable_pass",
        "baseline_passed": true,
        "passed": true,
        "valid": true,
        "reward": 1,
        "exception_type": null,
        "cost_usd": 3.0185630000000003,
        "n_agent_steps": 39,
        "duration_seconds": 586
      },
      {
        "task_name": "bandit-incremental-cache-control",
        "language": "python",
        "role": "boundary_fail",
        "baseline_passed": false,
        "passed": false,
        "valid": true,
        "reward": 0,
        "exception_type": null,
        "cost_usd": 4.117954,
        "n_agent_steps": 52,
        "duration_seconds": 920
      },
      {
        "task_name": "ipython-session-bundle-replay",
        "language": "python",
        "role": "stable_pass",
        "baseline_passed": true,
        "passed": true,
        "valid": true,
        "reward": 1,
        "exception_type": null,
        "cost_usd": 4.584818,
        "n_agent_steps": 45,
        "duration_seconds": 1014
      },
      {
        "task_name": "ofetch-per-origin-circuit-breaker",
        "language": "typescript",
        "role": "stable_pass",
        "baseline_passed": true,
        "passed": true,
        "valid": true,
        "reward": 1,
        "exception_type": null,
        "cost_usd": 2.7796270000000005,
        "n_agent_steps": 40,
        "duration_seconds": 837
      },
      {
        "task_name": "obsidian-linter-link-format-conversion",
        "language": "typescript",
        "role": "boundary_fail",
        "baseline_passed": false,
        "passed": false,
        "valid": true,
        "reward": 0,
        "exception_type": null,
        "cost_usd": 2.7941079999999996,
        "n_agent_steps": 34,
        "duration_seconds": 672
      },
      {
        "task_name": "kea-atomic-signal-selectors",
        "language": "typescript",
        "role": "stable_pass",
        "baseline_passed": true,
        "passed": true,
        "valid": true,
        "reward": 1,
        "exception_type": null,
        "cost_usd": 5.586837,
        "n_agent_steps": 77,
        "duration_seconds": 1149
      },
      {
        "task_name": "ink-grid-box-layout",
        "language": "typescript",
        "role": "stable_pass",
        "baseline_passed": true,
        "passed": true,
        "valid": true,
        "reward": 1,
        "exception_type": null,
        "cost_usd": 5.59914,
        "n_agent_steps": 91,
        "duration_seconds": 1028
      },
      {
        "task_name": "csstree-shorthand-expansion-compression",
        "language": "javascript",
        "role": "stable_pass",
        "baseline_passed": true,
        "passed": true,
        "valid": true,
        "reward": 1,
        "exception_type": null,
        "cost_usd": 3.818159,
        "n_agent_steps": 50,
        "duration_seconds": 1212
      },
      {
        "task_name": "fd-deterministic-multi-key-sorting",
        "language": "rust",
        "role": "boundary_fail",
        "baseline_passed": false,
        "passed": false,
        "valid": true,
        "reward": 0,
        "exception_type": null,
        "cost_usd": 4.2603670000000005,
        "n_agent_steps": 58,
        "duration_seconds": 905
      },
      {
        "task_name": "oxvg-structural-selector-preservation",
        "language": "rust",
        "role": "hard_pass",
        "baseline_passed": true,
        "passed": true,
        "valid": true,
        "reward": 1,
        "exception_type": null,
        "cost_usd": 8.490934,
        "n_agent_steps": 104,
        "duration_seconds": 1419
      }
    ]
  },
  "moving_average": {
    "iq_3d": 100.0,
    "iq_7d": 100.0
  },
  "updated_at": "2026-05-30T13:52:54.549487+08:00"
}
