"""run_evals.py: the command CI runs. Prints a scorecard, records the run, and sets the EXIT CODE.

Exit code is the whole point: 0 means the gate passed (merge allowed), non-zero means it failed (CI
blocks the merge). A GitHub Actions step that runs this will go red automatically when evals regress.

Usage:
    python run_evals.py            # gate the current app (exit 0 pass / 1 fail)
    python run_evals.py --buggy    # simulate a regression and watch the gate fail
"""

import sys
import json
import os
from app import respond
from evalset import run_suite, gate, THRESHOLD

# Path of the JSON file holding recent pass rates; it lives in the same
# directory as this script (resolved from __file__, not the working directory).
HISTORY = os.path.join(os.path.dirname(__file__), "eval_history.json")


def record(rate, path=None):
    """Append this run's pass rate so quality can be tracked over time.

    A missing, unparsable, or non-list history file is treated as an empty
    history instead of aborting the run.

    Args:
        rate: Pass rate of the current run; it is stored rounded to three
            decimal places.
        path: History file to update. Defaults to the module-level
            ``HISTORY`` path when omitted.

    Returns:
        The last (up to) five recorded pass rates, including this one.

    Raises:
        OSError: If the history file exists but cannot be read, or if it
            cannot be written.
    """
    if path is None:
        path = HISTORY

    # EAFP: open directly instead of exists()+open(), which races with
    # anything else touching the file between the two calls.
    try:
        with open(path, encoding="utf-8") as fh:
            history = json.load(fh)
    except (FileNotFoundError, ValueError):
        # ValueError covers both malformed JSON and undecodable bytes.
        history = []

    # Valid JSON that is not a list (e.g. {} or null) cannot be appended to.
    if not isinstance(history, list):
        history = []

    history.append(round(rate, 3))
    history = history[-20:]  # keep only the 20 most recent runs on disk

    # The context manager guarantees the data is flushed and the handle closed.
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(history, fh)
    return history[-5:]


def main(argv):
    """Run the eval suite, print the scorecard, and return the exit status.

    Args:
        argv: Command-line arguments without the program name. If it
            contains ``"--buggy"``, ``respond`` is called with ``buggy=True``.

    Returns:
        0 when ``gate(summary)`` is truthy, otherwise 1.
    """
    simulate_regression = "--buggy" in argv

    def answer(question):
        # Forward each eval question to the app, optionally in buggy mode.
        return respond(question, buggy=simulate_regression)

    summary = run_suite(answer)

    print("EVAL GATE")
    for result in summary["results"]:
        status = "pass" if result["pass"] else "FAIL"
        print(f"  [{status}] {result['id']}: {result['answer']!r}")

    passed, total = summary["passed"], summary["total"]
    rate_pct = summary["rate"] * 100
    threshold_pct = THRESHOLD * 100
    print(f"  ---- {passed}/{total} "
          f"({rate_pct:.0f}%), threshold {threshold_pct:.0f}%")

    # Recording happens after the scorecard and before the gate verdict.
    recent = record(summary["rate"])
    print(f"  recent pass rates: {recent}")

    if not gate(summary):
        print("  GATE FAILED: merge blocked (exit 1)")
        return 1
    print("  GATE PASSED: merge allowed (exit 0)")
    return 0


# Script entry point: pass the CLI arguments (minus the program name) to
# main() and use its return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
