"""runbook.py: turn "what do we do at 2am?" into a checklist anyone on-call can follow.

A runbook is a named, ordered list of mitigation steps for a known failure. Each step is a small,
reversible, safe action (flip to a fallback model, turn on caching, shed low-priority traffic,
escalate). A runbook is what lets a tired on-call engineer act correctly without having to be the
person who built the system.

Here a runbook runs against a MockSystem: each action changes a knob, and the system's health
recovers when the right mitigation is applied. Dependency-free, offline, deterministic.
"""
from oncall import sli


# Runbooks keyed by symptom. In production these live in your wiki next to the alert that fires them.
RUNBOOKS = {
    # Each entry maps a symptom key to one runbook: a human-readable title, the symptoms that
    # point at it, and the mitigation steps in the order they should be tried. Every step pairs
    # a description for the on-call engineer ("name") with the action name to apply ("action").
    "model_provider_outage": dict(
        title="Model provider returning 5xx / timeouts",
        symptoms=["error_rate high", "latency high", "upstream 503"],
        steps=[
            dict(name="Switch to the fallback model", action="use_fallback_model"),
            dict(name="Serve cached answers where possible", action="enable_cache"),
            dict(name="Shed low-priority traffic", action="shed_load"),
            dict(
                name="If still failing, escalate to L2 and post a status page update",
                action="escalate_L2",
            ),
        ],
    ),
    "runaway_cost": dict(
        title="Token spend spiking (possible runaway loop)",
        symptoms=["cost per request high", "step counts high"],
        steps=[
            dict(name="Lower the per-request step cap", action="tighten_step_cap"),
            dict(name="Route easy requests to the cheap model", action="use_fallback_model"),
            dict(name="Escalate to the owning team", action="escalate_L2"),
        ],
    ),
}


class MockSystem:
    """A stand-in for the deployed AI service. Mitigations flip its knobs; health responds.

    `good_ratio` is the fraction of requests currently succeeding. The outage drags it down; the
    right mitigations bring it back up. This is a teaching model, not a real service, but the
    cause->mitigation->recover shape is exactly the real one.

    Every health-improving mitigation is idempotent: applying it a second time is recorded in
    `applied` but does not improve `good_ratio` again.
    """

    def __init__(self, good_ratio=0.5):
        self.good_ratio = good_ratio          # start unhealthy (mid-outage)
        self.using_fallback = False           # set by "use_fallback_model"
        self.cache_on = False                 # set by "enable_cache"
        self.load_shed = False                # set by "shed_load"
        self.step_cap = 8                     # lowered to 3 by "tighten_step_cap"
        self.escalated_to = None              # set to "L2" by "escalate_L2"
        self.applied = []                     # every action passed to apply(), in call order

    def _recover(self, gain):
        """Raise `good_ratio` by `gain`, never going above 1.0."""
        self.good_ratio = min(1.0, self.good_ratio + gain)

    def apply(self, action: str):
        """Apply one mitigation action and let health recover accordingly.

        The action is always appended to `applied`, even when it has no effect (already applied,
        or not a recognised action name). Returns the resulting `good_ratio`.
        """
        self.applied.append(action)
        if action == "use_fallback_model" and not self.using_fallback:
            self.using_fallback = True
            self._recover(0.35)   # fallback is healthy
        elif action == "enable_cache" and not self.cache_on:
            self.cache_on = True
            self._recover(0.10)
        elif action == "shed_load" and not self.load_shed:
            self.load_shed = True
            self._recover(0.05)
        elif action == "tighten_step_cap" and self.step_cap > 3:
            # Guarded like the other mitigations: tightening an already-tight cap must not grant
            # a second health boost, and must never loosen a cap that is already below 3.
            self.step_cap = 3
            self._recover(0.10)
        elif action == "escalate_L2":
            # Escalation hands the incident over; it does not change health by itself.
            self.escalated_to = "L2"
        return self.good_ratio

    def sample(self, n=100):
        """Produce n outcome booleans at the current health (deterministic, no randomness).

        Successes (True) come first, followed by failures (False). The success count is clamped
        to 0..n so an out-of-range `good_ratio` cannot yield more than n outcomes.
        """
        good = max(0, min(n, round(self.good_ratio * n)))
        return [True] * good + [False] * (n - good)


def choose_runbook(symptom_key: str):
    """Look up the runbook registered under `symptom_key` in RUNBOOKS.

    Returns the runbook dict, or None when no runbook is registered for that key.
    """
    try:
        return RUNBOOKS[symptom_key]
    except KeyError:
        # Unknown symptom: report "no runbook" rather than raising.
        return None


def run_runbook(runbook, system: MockSystem, slo, incident=None, recover_at=0.99, step0=0):
    """Walk a runbook's steps in order until the system has recovered.

    For each step the action is applied to `system`, the SLI is re-measured from a fresh sample,
    and the result is recorded. The walk ends after the first step whose measured SLI is at or
    above `recover_at`, so mitigations that are no longer needed are never applied; if no step
    reaches it, every step is executed.

    When `incident` is given, each executed step is logged through `incident.mitigate` with a
    step number that counts up from `step0 + 1`. `slo` is accepted but not consulted here.

    Returns a list of (step_name, action, sli_after) tuples, one per step actually executed.
    """
    history = []
    for offset, entry in enumerate(runbook["steps"]):
        action = entry["action"]
        system.apply(action)
        current = sli(system.sample())   # health measured after this mitigation took effect
        label = entry["name"]
        history.append((label, action, current))
        if incident is not None:
            incident.mitigate(step0 + (offset + 1), f"{label} -> SLI {current:.0%}")
        if current >= recover_at:
            # Healthy again: leave the remaining mitigations unapplied.
            break
    return history