AXL Research Log

#!/usr/bin/env python3
"""Build a structured research log from this repo's git history.

================================================================
SCOPE NOTE (load-bearing; read before applying this tool elsewhere)
================================================================

The role classifier below is specific to THIS REPO's authoring
conventions as established between 2026-04-10 and 2026-04-20:

  - "Codex R\\d+" / "codex r\\d+"    marks a formal Codex review round
  - "Codex <sha> review"             marks a review response to commit <sha>
  - "Codex <sha> follow-up"          marks a follow-up on review of commit <sha>
  - "Codex review of <sha>"          same, alternate phrasing
  - "Gap \\d+"                       marks a Claude Code substrate milestone
  - "ship:"                          marks an operator-approved shipping commit
  - "RESULT" in subject              marks a cold-read decision-gate result
  - "bench:"                         marks a benchmark measurement
  - "spec:"                          marks a spec-file change
  - "docs:"                          marks a documentation-only change

These patterns are NOT a general convention. Applied to another repo
with different authoring conventions, this script will produce
meaningless role classifications. It is a SCOPED tool for
reconstructing the dual-agent research iteration on this particular
repository.

The generated log's "deterministic function of history" property
holds under two assumptions:

  1. Future commits continue to explicitly name the SHA they respond
     to (e.g. "Codex a6785c2 follow-up"). If that discipline drops,
     the response-edge DAG loses edges. Commit hygiene constraint,
     not a script bug.
  2. The conventions listed above remain stable. If new review modes
     appear (e.g. operator-driven review rounds, external reviewers),
     the classifier will silently mis-label them as
     "claude-research-impl". New convention -> update classifier.

================================================================
Usage
================================================================

  python3 tools/build-research-log.py --format json
      -> prints research-log.json to stdout

  python3 tools/build-research-log.py --format html
      -> prints research-log.html to stdout

  python3 tools/build-research-log.py --write
      -> writes both docs/axlprotocol-org/research-log.json
         and docs/axlprotocol-org/research-log.html

  python3 tools/build-research-log.py --summary
      -> prints role counts and edge counts for spot-checking before
         publication. This is the self-check CC-OPS-AXLPROTOCOL
         requested in the Phase 2.5 reply.

Outputs are deterministic functions of the current HEAD's git log
on this repo. No LLM calls, no network, no hand-curation.
"""
from __future__ import annotations

import argparse
import html
import json
import re
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path

# Directory two levels above this file. The usage text in the module
# docstring places the script under tools/, which makes this the repo root.
# It is the working directory for every git call below and the base for the
# files written by --write.
REPO_ROOT = Path(__file__).resolve().parent.parent
# Generator version string; emitted in the log as "generator_version".
VERSION = "1.0"


# ---------- Regex rules (scoped to this repo's conventions) ----------

# Short or full SHA in commit messages.
_SHA_RE = re.compile(r"\b([0-9a-f]{7,40})\b")

# Response-edge patterns: commit explicitly names the commit it responds to.
# extract_responds_to() tries them in list order and uses the first that
# matches; group(1) is always the referenced SHA.
#
# Every SHA group must end on a boundary. In the first two patterns the
# mandatory whitespace after the group provides it; the remaining three end
# with the SHA and therefore need an explicit trailing \b. Without it, plain
# English words that merely begin with seven hex letters are captured as
# SHAs: "review of feedback" yields the bogus target "feedbac" and adds a
# phantom response edge.
_RESPONSE_PATTERNS = [
    re.compile(r"Codex\s+([0-9a-f]{7,40})\s+review", re.IGNORECASE),
    re.compile(r"Codex\s+([0-9a-f]{7,40})\s+follow[- ]up", re.IGNORECASE),
    re.compile(r"Codex\s+review\s+of\s+([0-9a-f]{7,40})\b", re.IGNORECASE),
    re.compile(r"review\s+of\s+([0-9a-f]{7,40})\b", re.IGNORECASE),
    re.compile(r"follow[- ]up\s+(?:to|on)\s+([0-9a-f]{7,40})\b", re.IGNORECASE),
]

# Role classification rules. First match wins. Ordered by specificity.
#
# Each entry is (compiled pattern, role label). classify_role() runs
# pattern.search() over subject + "\n" + body, so an unanchored pattern can
# match in the body as well as in the subject. The "^prefix:" rules are
# compiled without re.MULTILINE, so "^" only matches at the very start of
# that combined text, i.e. at the start of the subject line.
_ROLE_RULES = [
    # Codex formal review rounds (R1-R5, or sub-labels like R4.1).
    # Also matches the long-form phrasing "round 1" used in 0a5cad4.
    (re.compile(
        r"\bCodex\s+R\d+|codex\s+r\d+|R\d+\s+findings|findings\s+from\s+Codex\s+R\d+|"
        r"Codex\s+.*?\bround\s+\d+",
        re.IGNORECASE), "codex-review-round"),
    # Codex review responses naming a target SHA (post-gate reviews).
    (re.compile(r"Codex\s+[0-9a-f]{7,40}\s+(?:review|follow[- ]up)|Codex\s+review\s+of\s+[0-9a-f]{7,40}",
                re.IGNORECASE), "codex-review-response"),
    # Catch-all "Codex review" without a SHA (e.g. "Codex review" amendments).
    (re.compile(r"Codex\s+review\b", re.IGNORECASE), "codex-review-response"),
    # Cold-read decision-gate results.
    # Compiled without IGNORECASE: only the upper-case whole word RESULT
    # matches, and it may appear in the subject or in the body.
    (re.compile(r"\bRESULT\b"), "corpus-result"),
    # Operator-approved ships.
    (re.compile(r"^ship:", re.IGNORECASE), "ship"),
    # Commit categories via subject prefix.
    (re.compile(r"^spec:", re.IGNORECASE), "spec"),
    (re.compile(r"^bench:", re.IGNORECASE), "bench"),
    (re.compile(r"^docs:", re.IGNORECASE), "docs"),
    # Substrate milestones.
    (re.compile(r"\bGap\s+\d+", re.IGNORECASE), "substrate-gap"),
    # Cold-read kit builds and prompt/seed changes.
    # The alternation is top-level: "cold-read ... kit" on one line, or the
    # bare substrings "seed", "reconstruction" or "prompt" anywhere in the
    # text, each select this role.
    (re.compile(r"cold[- ]read.*kit|seed|reconstruction|prompt", re.IGNORECASE), "gate-kit"),
]


def classify_role(subject: str, body: str) -> str:
    """Return the role label of a commit.

    The subject and body are joined with a newline and checked against
    ``_ROLE_RULES`` in list order; the label of the first rule whose pattern
    is found anywhere in that text is returned.

    Args:
        subject: Commit subject line.
        body: Commit body text (may be empty).

    Returns:
        The matching rule's label, or ``"claude-research-impl"`` when no
        rule matches.
    """
    text = "\n".join((subject, body))
    # Lazy generator: evaluation stops at the first matching rule.
    hits = (label for pattern, label in _ROLE_RULES if pattern.search(text))
    return next(hits, "claude-research-impl")


def extract_round(subject: str, body: str) -> str | None:
    """Return the formal review-round label of a commit, e.g. "R1" or "R4.1".

    The subject and body are joined with a newline and probed with four
    patterns in fixed priority order; the first pattern that matches decides
    the result, and within a pattern the leftmost match is used.

    Known limitations:
      - A sub-finding that is only mentioned inside a parent round's commit
        (e.g. an "R4.1 low finding" discussed in the R4 commit body) does
        not become its own label, because the parent round matches first.
        It would need its own commit carrying R4.1 in the subject or body.
      - The long-form "round 1" phrasing used in 0a5cad4 has a dedicated
        pattern; other long-form phrasings of later rounds may be missed.

    Args:
        subject: Commit subject line.
        body: Commit body text (may be empty).

    Returns:
        "R" followed by the captured number, or None when nothing matches.
    """
    text = f"{subject}\n{body}"
    # (pattern, flags) in priority order. The exact-case "Codex R<n>" form is
    # tried before the case-insensitive one, so it wins even if a lower-case
    # mention appears earlier in the text.
    probes = (
        (r"\bCodex\s+R(\d+(?:\.\d+)?)", 0),
        (r"codex\s+r(\d+(?:\.\d+)?)", re.IGNORECASE),
        (r"\bR(\d+(?:\.\d+)?)\s+findings", 0),
        # Long-form "round N" phrasing (matches 0a5cad4's "round 1").
        (r"Codex\s+.*?\bround\s+(\d+)\b", re.IGNORECASE),
    )
    for pattern, flags in probes:
        found = re.search(pattern, text, flags)
        if found is not None:
            return "R" + found.group(1)
    return None


def extract_responds_to(subject: str, body: str) -> str | None:
    """Extract the target SHA this commit responds to, if any.

    The subject and body are joined with a newline and checked against
    ``_RESPONSE_PATTERNS`` in list order; the first pattern that matches
    supplies the SHA.

    Args:
        subject: Commit subject line.
        body: Commit body text (may be empty).

    Returns:
        The referenced SHA normalized to a 7-character, lower-case short
        form, or None when no response pattern matches.
    """
    full = subject + "\n" + body
    for pat in _RESPONSE_PATTERNS:
        m = pat.search(full)
        if m:
            # The response patterns are compiled with re.IGNORECASE, so the
            # hex class also accepts "A6785C2". Lower-case the result so it
            # compares equal to the "sha" values taken from git output and
            # so the "#sha-..." anchor emitted by render_html() resolves
            # (HTML ids are case-sensitive).
            return m.group(1)[:7].lower()
    return None


# Metric extraction table: (kind, compiled pattern) pairs, applied in this
# order by extract_metrics(). Extraction is best-effort: only claims written
# in one of these shapes become metric records. Anything phrased differently
# is not lost, it simply stays in the commit's verbatim subject and body.
_METRIC_PATTERNS = [
    (
        "pct_transition",
        re.compile(r"(\d+(?:\.\d+)?)\s*%\s*(?:→|->|\bto\b)\s*(\d+(?:\.\d+)?)\s*%"),
    ),
    (
        "numeric_transition",
        re.compile(r"(?<!\d)(\d+(?:\.\d+)?)\s*(?:→|->)\s*(\d+(?:\.\d+)?)(?!\d)"),
    ),
    (
        "test_count_pass",
        re.compile(r"\b(\d{2,4})\s*(?:tests?\s+pass|passed|/\d+\s+tests?)"),
    ),
    (
        "delta_recall",
        re.compile(r"Δ?recall\s*([+-]?\d+\.\d+)", re.IGNORECASE),
    ),
    (
        "delta_precision",
        re.compile(r"Δ?precision\s*([+-]?\d+\.\d+)", re.IGNORECASE),
    ),
    (
        "delta_f",
        re.compile(r"ΔF\s*([+-]?\d+\.\d+)"),
    ),
    (
        "compression_ratio",
        re.compile(r"(\d+(?:\.\d+)?)\s*[x×]\s*(?:chars?|tokens?|compression)", re.IGNORECASE),
    ),
]


def extract_metrics(body: str) -> list[dict]:
    """Collect every metric-shaped claim found in a commit body.

    Each pattern in ``_METRIC_PATTERNS`` is scanned over ``body`` in table
    order; within one pattern, matches are reported left to right.

    Args:
        body: Commit body text.

    Returns:
        A list of records, each with:
          - "kind":   the pattern's name,
          - "text":   the whole matched text with surrounding whitespace
                      stripped,
          - "groups": the captured groups, omitting any that did not
                      participate in the match.
        The list is empty when nothing matches.
    """
    return [
        {
            "kind": kind,
            "text": match.group(0).strip(),
            "groups": [value for value in match.groups() if value is not None],
        }
        for kind, pattern in _METRIC_PATTERNS
        for match in pattern.finditer(body)
    ]


# ---------- Git log walker ----------

# Delimiters embedded in the git --format string by git_log_records() and
# used to split its output again: 0x1F (ASCII Unit Separator) between the
# fields of one commit, 0x1E (ASCII Record Separator) after each commit.
# NOTE(review): presumably chosen because these control characters do not
# occur in ordinary commit messages - confirm.
_FIELD_SEP = "\x1f"
_RECORD_SEP = "\x1e"


def git_log_records() -> list[dict]:
    """Read the commit history reachable from HEAD, oldest first.

    Runs ``git log --reverse`` in ``REPO_ROOT`` with a custom format whose
    fields are joined by ``_FIELD_SEP`` and whose records end with
    ``_RECORD_SEP``, then splits the output back into one dict per commit.

    Returns:
        A list of dicts with the keys:
          - "full_sha":   full commit hash,
          - "sha":        first 7 characters of the full hash,
          - "date":       author timestamp as an ISO-8601 string in UTC,
          - "date_short": the same timestamp as YYYY-MM-DD (UTC),
          - "author":     author name,
          - "subject":    subject line,
          - "body":       body with surrounding whitespace stripped.
        Records that do not split into the expected number of fields are
        skipped.

    Raises:
        subprocess.CalledProcessError: if git exits with a non-zero status.
        UnicodeDecodeError: if git's output is not valid UTF-8.
    """
    fmt = _FIELD_SEP.join(["%H", "%at", "%an", "%s", "%b"])
    # Decode as UTF-8 explicitly. With text=True alone the locale's preferred
    # encoding is used, so the same history could decode differently (or fail
    # on characters such as "Δ" / "→") depending on the machine.
    output = subprocess.check_output(
        ["git", "log", "--reverse", f"--format={fmt}{_RECORD_SEP}"],
        cwd=REPO_ROOT, text=True, encoding="utf-8",
    )
    field_count = 5
    records = []
    for block in output.split(_RECORD_SEP):
        # git terminates each formatted record with a newline; drop it along
        # with the empty tail left after the final record separator.
        block = block.strip("\n\r")
        if not block.strip():
            continue
        # Bounded split: the body is the last field, so a separator character
        # inside it stays part of the body instead of truncating it.
        parts = block.split(_FIELD_SEP, field_count - 1)
        if len(parts) < field_count:
            continue
        full_sha, ts, author, subject, body = parts
        when = datetime.fromtimestamp(int(ts), tz=timezone.utc)
        records.append({
            "full_sha": full_sha,
            # Derived from the full hash rather than git's %h so the length
            # is always 7 regardless of the core.abbrev setting, matching
            # head_sha() and the 7-character SHAs used for response edges.
            "sha": full_sha[:7],
            "date": when.isoformat(),
            "date_short": when.strftime("%Y-%m-%d"),
            "author": author,
            "subject": subject,
            "body": body.strip(),
        })
    return records


def head_sha() -> str:
    """Return the first 7 characters of the commit hash HEAD points to.

    Runs ``git rev-parse HEAD`` with ``REPO_ROOT`` as working directory.

    Raises:
        subprocess.CalledProcessError: if git exits with a non-zero status.
    """
    completed = subprocess.run(
        ["git", "rev-parse", "HEAD"],
        cwd=REPO_ROOT,
        text=True,
        stdout=subprocess.PIPE,
        check=True,
    )
    full_hash = completed.stdout.strip()
    return full_hash[:7]


# ---------- Log assembly ----------


def build_log() -> dict:
    """Assemble the complete research log from the git history.

    Every commit returned by ``git_log_records()`` is annotated with its
    role, review-round label, response target and extracted metrics. The
    annotated commits are returned together with aggregate counts and
    generator metadata.

    Returns:
        A JSON-serializable dict with the keys "generator",
        "generator_version", "generator_head", "generated_at", "scope_note",
        "commit_count", "role_counts", "response_edge_count",
        "round_entry_count" and "commits" (oldest commit first).
    """

    def annotate(commit: dict) -> dict:
        # One log entry per commit. Key order is the order in the JSON output.
        subject, body = commit["subject"], commit["body"]
        return {
            "sha": commit["sha"],
            "full_sha": commit["full_sha"],
            "date": commit["date"],
            "date_short": commit["date_short"],
            "subject": subject,
            "role": classify_role(subject, body),
            "round": extract_round(subject, body),
            "responds_to": extract_responds_to(subject, body),
            # Metrics are extracted from the body only, not the subject.
            "metrics": extract_metrics(body),
        }

    entries = [annotate(commit) for commit in git_log_records()]

    # Tally roles in first-seen order.
    role_counts: dict[str, int] = {}
    for entry in entries:
        seen_so_far = role_counts.get(entry["role"], 0)
        role_counts[entry["role"]] = seen_so_far + 1

    edge_total = len([e for e in entries if e["responds_to"] is not None])
    round_total = len([e for e in entries if e["round"] is not None])

    return {
        "generator": "tools/build-research-log.py",
        "generator_version": VERSION,
        "generator_head": head_sha(),
        "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
        "scope_note": (
            "The role classifier in this log is specific to this repo's "
            "authoring conventions (Codex <sha> review, codex r\\d+, "
            "Gap \\d+, ship:, bench:, RESULT, etc.). These patterns are "
            "not a general convention. Applied to another repo with "
            "different conventions, this script produces meaningless "
            "classifications. The deterministic property holds as long "
            "as future commits continue to name the SHA they respond "
            "to; if commit discipline drops, the DAG loses edges. "
            "Commit hygiene constraint, not a script bug."
        ),
        "commit_count": len(entries),
        "role_counts": role_counts,
        "response_edge_count": edge_total,
        "round_entry_count": round_total,
        "commits": entries,
    }


# ---------- HTML emitter ----------


def render_html(log: dict) -> str:
    """Render the research log as an HTML fragment.

    The result starts with an ``<h1>`` and has no ``<html>``/``<body>``
    wrapper. It contains a fixed explanatory preamble, a role summary table
    sorted by descending commit count, and one ``<section id="sha-...">`` per
    commit in the order given by ``log["commits"]``. All values taken from
    ``log`` are HTML-escaped before insertion.

    Args:
        log: Dict shaped like the return value of ``build_log()``. Keys read
            here: "generator_head", "generated_at", "generator_version",
            "commit_count", "response_edge_count", "round_entry_count",
            "role_counts" and "commits" (each commit providing "sha", "role",
            "date_short", "subject", "round", "responds_to" and "metrics").

    Returns:
        The HTML text.
    """
    role_rows = "\n    ".join(
        f"<tr><td><code>{html.escape(role)}</code></td><td>{count}</td></tr>"
        for role, count in sorted(log["role_counts"].items(), key=lambda kv: -kv[1])
    )

    # SHAs that actually have a <section id="sha-..."> on this page. A
    # "Responds to" target outside this set (e.g. a commit that is not in the
    # walked history) is shown as plain text rather than as an in-page link
    # that leads nowhere.
    known_shas = {e["sha"] for e in log["commits"]}

    sections = []
    for e in log["commits"]:
        responds_line = ""
        if e["responds_to"]:
            target = html.escape(e["responds_to"])
            if e["responds_to"] in known_shas:
                target_html = f'<a href="#sha-{target}"><code>{target}</code></a>'
            else:
                target_html = f'<code>{target}</code>'
            responds_line = f'<p class="responds-to">Responds to: {target_html}</p>'

        round_line = ""
        if e["round"]:
            round_line = f'<p class="round"><strong>Round {html.escape(e["round"])}</strong></p>'

        metrics_block = ""
        if e["metrics"]:
            items = "\n      ".join(
                f'<li><code>{html.escape(m["kind"])}</code>: '
                f'<span class="metric-text">{html.escape(m["text"])}</span></li>'
                for m in e["metrics"]
            )
            metrics_block = (
                f'<details class="metrics">\n'
                f'    <summary>{len(e["metrics"])} metric(s) extracted</summary>\n'
                f'    <ul>\n      {items}\n    </ul>\n'
                f'  </details>'
            )

        sections.append(
            f'<section id="sha-{html.escape(e["sha"])}" class="commit role-{html.escape(e["role"])}">\n'
            f'  <h3>\n'
            f'    <a href="#sha-{html.escape(e["sha"])}" class="anchor"><code>{html.escape(e["sha"])}</code></a>\n'
            f'    <span class="role-badge">{html.escape(e["role"])}</span>\n'
            f'    <span class="date">{html.escape(e["date_short"])}</span>\n'
            f'  </h3>\n'
            f'  <p class="subject">{html.escape(e["subject"])}</p>\n'
            f'  {responds_line}\n'
            f'  {round_line}\n'
            f'  {metrics_block}\n'
            f'</section>'
        )

    commit_sections = "\n\n".join(sections)

    head = log["generator_head"]
    generated_at = log["generated_at"]
    commit_count = log["commit_count"]
    response_edge_count = log["response_edge_count"]
    round_entry_count = log["round_entry_count"]
    version = log["generator_version"]

    return f"""<h1>AXL Research Log</h1>

<p><em>A per-commit record of the dual-agent research iteration, generated deterministically from the git history at HEAD <code>{html.escape(head)}</code> on {html.escape(generated_at)}.</em></p>

<h2>How this was generated</h2>

<p>This page is the output of <code>tools/build-research-log.py v{html.escape(version)}</code>, a deterministic script that walks the repo's git history and classifies each commit by role using regex rules that match this repo's authoring conventions. It is not a hand-curated narrative. Running the same script against the same git HEAD produces the same output.</p>

<p><strong>Assumptions this property depends on:</strong></p>

<ol>
  <li><strong>Commit message discipline.</strong> The DAG of review-and-response edges is reconstructed from explicit SHA references in commit messages (patterns like "Codex <em>sha</em> review", "Codex <em>sha</em> follow-up", "Codex review of <em>sha</em>"). If future commits stop naming the SHA they respond to, the DAG loses edges. This is a commit-hygiene constraint, not a script bug.</li>
  <li><strong>Repo-scoped regex classifier.</strong> Role classification depends on this repo's authoring conventions: <code>codex r\\d+</code>, <code>Gap \\d+</code>, <code>ship:</code>, <code>bench:</code>, <code>spec:</code>, <code>docs:</code>, <code>RESULT</code>. Applying this script to another repo with different conventions produces meaningless classifications.</li>
  <li><strong>Metric extraction is best-effort.</strong> Numeric transitions in commit bodies are captured when they match known patterns (<code>NN.NN% -&gt; NN.NN%</code>, <code>NNN tests pass</code>, <code>Δrecall ±NN.NN</code>). Claims stated in other forms may not surface as structured metrics. The subject and body text are always preserved verbatim in the JSON so human readers can verify.</li>
</ol>

<p><strong>Known classifier limitations on this specific history:</strong></p>

<ul>
  <li>The R4.1 sub-finding (referenced in the R4 commit body <code>099dbe6</code>) does not appear as its own round-labeled entry because it does not have its own commit. Extraction picks the first round match in a commit, so R4 wins. To surface R4.1 as an independent entry, a future sub-finding would need its own commit with <code>R4.1</code> in the subject.</li>
  <li>Body-level mentions of round labels occasionally produce false-positive round assignments on commits that reference a round in narrative explanation without BEING that round. Verifiable by reading the subject in the entry below.</li>
  <li>The long-form phrasing "round 1" in commit <code>0a5cad4</code> is caught by a dedicated pattern; other long-form phrasings of later rounds (if any) may not be caught.</li>
</ul>

<p>Inputs: <code>git log --reverse --format=...</code> against this repo. Outputs: <code>research-log.json</code> (machine-readable, complete) and this HTML page.</p>

<h2>Summary</h2>

<table>
  <thead>
    <tr><th>Role</th><th>Commits</th></tr>
  </thead>
  <tbody>
    {role_rows}
  </tbody>
</table>

<p>Total commits: <strong>{commit_count}</strong>. Response edges (commits that name a target SHA): <strong>{response_edge_count}</strong>. Commits with a formal review-round label: <strong>{round_entry_count}</strong>.</p>

<h2>Commit log</h2>

{commit_sections}
"""


# ---------- CLI ----------


def main() -> int:
    """Command-line entry point.

    Exactly one action is performed, chosen in this order of precedence:
    ``--summary`` (print counts), ``--write`` (write both output files under
    docs/axlprotocol-org/), ``--format json|html`` (print to stdout). With
    none of these given, the help text is printed.

    Returns:
        0 on success; 1 when no action was requested.
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring is laid out by hand (rulers, indented usage
        # examples); the default formatter would re-wrap it into one
        # paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("--format", choices=["json", "html"], default=None,
                    help="emit JSON or HTML to stdout")
    ap.add_argument("--write", action="store_true",
                    help="write research-log.json and research-log.html into "
                         "docs/axlprotocol-org/")
    ap.add_argument("--summary", action="store_true",
                    help="print role counts and edge counts; useful for "
                         "spot-checking before publication")
    args = ap.parse_args()

    # Decide this before touching git: showing help should neither require a
    # working repository nor pay for walking the whole history.
    if not (args.summary or args.write or args.format):
        ap.print_help()
        return 1

    log = build_log()

    if args.summary:
        print(f"generator_head:     {log['generator_head']}")
        print(f"generated_at:       {log['generated_at']}")
        print(f"commit_count:       {log['commit_count']}")
        print(f"response_edges:     {log['response_edge_count']}")
        print(f"round_entries:      {log['round_entry_count']}")
        print()
        print("role_counts:")
        for role, n in sorted(log["role_counts"].items(), key=lambda kv: -kv[1]):
            print(f"  {role:28s}  {n}")
        return 0

    if args.write:
        out_dir = REPO_ROOT / "docs" / "axlprotocol-org"
        out_dir.mkdir(parents=True, exist_ok=True)
        json_path = out_dir / "research-log.json"
        html_path = out_dir / "research-log.html"
        # Write UTF-8 explicitly: the JSON is dumped with ensure_ascii=False
        # and the HTML template contains non-ASCII characters, so relying on
        # the locale's default encoding can fail or produce different bytes
        # on different machines.
        json_path.write_text(
            json.dumps(log, indent=2, ensure_ascii=False) + "\n",
            encoding="utf-8",
        )
        html_path.write_text(render_html(log), encoding="utf-8")
        print(f"wrote {json_path}")
        print(f"wrote {html_path}")
        return 0

    if args.format == "json":
        print(json.dumps(log, indent=2, ensure_ascii=False))
    else:
        # argparse restricts --format to "json" or "html".
        print(render_html(log))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())