"""drill.py: one on-call shift, end to end, integrating M31 + M32 + M33.

The service under operation is the M27 support agent (here a stand-in whose health depends on which
release is live). The story: a bad deploy went out this morning, and operations support detects it,
pages, links a user ticket, mitigates by rolling back, learns (postmortem -> regression eval), and
ships the fix behind a canary. Pure operations logic, offline and deterministic.
"""
from parts import (SLO, sli, burn_rate, alert, Incident, write_postmortem, to_eval_case,
                   triage, route, correlate, ReleaseManager)

# ---- the deployed agent (M27 stand-in): three releases, one of them regressed ----
# Ground-truth answers the support agent is expected to give, keyed by question.
# Insertion order matters: the eval set is derived from this mapping.
ANSWERS = {
    "reset password": "use the settings page",
    "refund window": "30 days",
    "billing owner": "Jordan",
}


def v1_stable(q):
    """Last week's good release: answer every known question correctly.

    Raises KeyError for a question that is not in ANSWERS.
    """
    answer = ANSWERS[q]
    return answer


def v2_regressed(q):
    """This morning's bad deploy: identical to the good release except for the refund answer.

    Raises KeyError for a question that is not in ANSWERS.
    """
    # Look the question up first so unknown questions still fail exactly like the good release.
    correct = ANSWERS[q]
    if q == "refund window":
        return "7 days"  # the regression: the refund window is reported wrongly
    return correct


def v3_fix(q):
    """The corrected release: serves the ground-truth answer for every known question.

    Raises KeyError for a question that is not in ANSWERS.
    """
    fixed_answer = ANSWERS[q]
    return fixed_answer


# Success ratio used to synthesise outcome windows for each release (see health_window).
RELEASE_HEALTH = {
    "v1-stable": 0.997,
    "v2-regressed": 0.80,
    "v3-fix": 0.997,
}
# (question, expected answer) pairs in ANSWERS order; handed to rm.release() as the eval set.
EVAL_SET = [*ANSWERS.items()]
def scorer(got, expected):
    """Score one eval case: the result of comparing the produced answer to the expected one."""
    is_match = got == expected
    return is_match


def health_window(release, n):
    """Build a deterministic window of n outcomes at the given release's success ratio.

    The window lists every success (True) first, followed by every failure (False).
    The success count is RELEASE_HEALTH[release] * n rounded with the built-in round().
    Raises KeyError if the release has no entry in RELEASE_HEALTH.
    """
    successes = round(RELEASE_HEALTH[release] * n)
    failures = n - successes
    window = [True] * successes
    window.extend([False] * failures)
    return window


def build_storm():
    """Build the fixed 40-alert storm fed to the correlator.

    Alerts are emitted burst by burst, in this order: agent-api "wrong-answer" (14),
    agent-api "5xx" (12), vector-store "timeout" (11), and a later auth "error" burst (3).
    Each alert is a dict with keys "t", "service" and "symptom"; a new list is returned
    on every call.
    """
    # (service, symptom, timestamps) for each burst, in emission order.
    bursts = (
        ("agent-api", "wrong-answer", (0, 0, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5)),
        ("agent-api", "5xx", (0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5)),
        ("vector-store", "timeout", (2, 2, 3, 3, 4, 5, 6, 6, 7, 7, 8)),
        ("auth", "error", (14, 14, 15)),
    )
    return [
        {"t": tick, "service": service, "symptom": symptom}
        for service, symptom, ticks in bursts
        for tick in ticks
    ]


def run() -> dict:
    """Play the scripted on-call shift once and return a summary of what happened.

    The steps run strictly in order because `rm` (the ReleaseManager) and `inc` (the
    Incident) are stateful and each step builds on the previous one:

      0. register the three releases, deploy v1-stable, then release v2-regressed with
         the gate thresholds opened up (min_pass=0.0, regress_tol=1.0)
      1. correlate the alert storm into incidents
      2. evaluate the two-window burn alert and open incident INC-201
      3. triage and route a user ticket, and log it on the incident
      4. log the triaged symptom
      5. roll back the live release and re-measure the SLI
      6. resolve the incident, derive a regression eval case and a postmortem
      7. release v3-fix through the canary gate

    Returns:
        A dict with the human-readable "trace" lines plus the values gathered along the
        way (alert counts, paging outcome, ticket fields, incident fields, SLI before and
        after, release names, regression case, fix decision, postmortem).
    """
    trace = []
    slo = SLO("answer-success", 0.99)

    rm = ReleaseManager()
    rm.register("v1-stable", v1_stable)
    rm.register("v2-regressed", v2_regressed)
    rm.register("v3-fix", v3_fix)
    rm.deploy("v1-stable")
    # the root cause: a release shipped WITHOUT a canary and regressed quality
    rm.release("v2-regressed", EVAL_SET, scorer, min_pass=0.0, regress_tol=1.0)  # forced out, no gate
    # NOTE(review): the release result is discarded and the trace line hard-codes the live
    # release name instead of reading rm.live -- confirm these thresholds always promote.
    trace.append("06:40 bad deploy: v2-regressed shipped with no canary -> live=v2-regressed")

    # 1) AIOps (M32): the alert storm collapses into incidents
    storm = build_storm()
    incidents = correlate(storm)
    trace.append(f"AIOps (M32): {len(storm)} alerts correlated into {len(incidents)} incidents")

    # 2) on-call (M31): SLI is degraded under v2 -> two-window burn alert pages
    # fast = 100-sample window, slow = 1000-sample window, both at v2's success ratio
    fast, slow = health_window("v2-regressed", 100), health_window("v2-regressed", 1000)
    sli_before = sli(fast)  # measured on the fast (100-sample) window
    action, severity, reason = alert(slo, fast, slow)
    inc = Incident("INC-201", severity, "Answer quality dropped after the morning deploy")
    inc.log(f"paged: {reason} (SLI {sli_before:.0%}, burn {burn_rate(slo, fast):.0f}x)")
    trace.append(f"on-call (M31): {action.upper()} {severity} - {reason}")

    # 3) support desk (M32): a user ticket about the same outage, triaged and linked
    ticket_text = "The assistant gives a wrong answer about refunds since this morning"
    tr = triage(ticket_text)  # read below via tr['severity'] and tr['confidence']
    rt = route(tr)            # read below via rt['tier'] and rt['note']
    inc.log(f"linked user ticket: {tr['severity']} (conf {tr['confidence']}) -> {rt['tier']}")
    trace.append(f"support desk (M32): ticket {tr['severity']} routed {rt['note']}")

    # 4) triage: name the symptom so we can pick the play
    inc.log("triage: symptom = bad_release (v2 deployed this morning, no canary)")

    # 5) mitigate (M31 runbook -> M33 rollback): stop the bleeding by rolling back
    live_before = rm.live  # captured before the rollback so the trace can show both names
    rolled = rm.rollback()
    inc.mitigate(f"runbook step: rollback live release {live_before} -> {rolled} (M33)")
    # NOTE(review): sli_after uses a 1000-sample window while sli_before used the 100-sample
    # fast window, so the two figures are not measured over the same window size.
    sli_after = sli(health_window(rolled, 1000))
    inc.log(f"SLI recovered {sli_before:.0%} -> {sli_after:.0%}")
    trace.append(f"mitigate (M31+M33): rollback {live_before} -> {rolled}; SLI {sli_before:.0%} -> {sli_after:.0%}")

    # 6) resolve + learn (M31 -> M26): postmortem and a regression eval case
    # NOTE(review): the cause text hard-codes "v1-stable" rather than interpolating `rolled`.
    inc.resolve("v2 regressed the refund answer; shipped without a canary; rolled back to v1-stable")
    regression_case = to_eval_case(inc)
    postmortem = write_postmortem(inc)
    trace.append(f"resolve (M31): postmortem written; regression case {regression_case['id']} added to the gate (M26)")

    # 7) ship the fix (M33 canary, gated by the now-stronger eval set)
    # NOTE(review): EVAL_SET is passed unchanged here; regression_case is not appended to it in
    # this function, so the gate is only "stronger" if that happens elsewhere -- TODO confirm.
    fix = rm.release("v3-fix", EVAL_SET, scorer, min_pass=0.8)
    trace.append(f"ship fix (M33): v3-fix canary pass={fix['cand_pass']} vs live {fix['base_pass']} -> {fix['decision']}; live={rm.live}")

    return {
        "trace": trace,
        "alerts": len(storm), "incidents_from_storm": len(incidents),
        "paged": action == "page", "severity": severity, "alert_reason": reason,
        # triage fields merged with routing fields; routing wins on any shared key
        "ticket": {**tr, **rt},
        "incident_id": inc.id, "incident_status": inc.status, "root_cause": inc.cause,
        "sli_before": sli_before, "sli_after": sli_after, "slo_objective": slo.objective,
        "live_before": live_before, "live_after_rollback": rolled, "live_final": rm.live,
        "regression_case": regression_case, "fix_decision": fix["decision"],
        "postmortem": postmortem,
    }


if __name__ == "__main__":
    # Run the drill once, echo the shift timeline, then print a one-line end-state summary.
    report = run()
    for entry in report["trace"]:
        print(" -", entry)
    summary = f"\nfinal: live={report['live_final']}  SLI={report['sli_after']:.0%}  incident={report['incident_status']}"
    print(summary)
