"""oncall.py: define "healthy", measure it, and decide when to wake a human.

Operations support safeguards what the rest of the course BUILT. Step one is to make "healthy"
a number you can watch, not a feeling. This file is the SRE backbone applied to an AI service:

  SLO            : the promise, e.g. "99% of requests succeed" (objective)
  SLI            : the measurement, e.g. the success ratio you actually observed
  error budget   : how much failure the SLO allows before you must act (1 - objective)
  burn rate      : how fast you are spending that budget (1x = exactly on budget, >1x = too fast)
  two-window alert: page on a FAST burn (budget gone in hours) or a hot sustained burn,
                    ticket on a slow burn
  Incident       : the record you open when paged, with a timeline you can hand to a postmortem

No dependencies, no network, fully offline and deterministic.
"""
from dataclasses import dataclass, field

# Severity: how bad, and how fast a human must respond.
# Keys are the severity codes that alert() returns and Incident.severity carries;
# values are the human-readable meaning that write_postmortem() prints next to the code.
SEVERITY = {
    "sev1": "critical: user-facing outage, page now",
    "sev2": "major: degraded, page during hours",
    "sev3": "minor: open a ticket, fix this week",
}


@dataclass
class SLO:
    """A service-level objective: the promise you operate against.

    Attributes:
        name: Identifier of the objective.
        objective: Target share of "good" requests as a ratio, not a
            percentage (0.99 means 99%).
        description: Optional free-text explanation of the promise.
    """
    name: str
    objective: float = 0.99          # 0.99 => 99% of requests must be "good"
    description: str = ""

    @property
    def error_budget(self) -> float:
        """Fraction of requests allowed to be bad and still meet the SLO.

        Computed as ``1 - objective`` and floored at 0.0. An objective above
        1.0 (for example 99 written as a percentage) would otherwise produce a
        negative budget; dividing by it yields a negative burn rate, which
        compares below every alert threshold and so looks healthy. Flooring
        turns that misconfiguration into "no failures allowed" instead.
        """
        return max(0.0, 1.0 - self.objective)


def sli(outcomes) -> float:
    """Return the measured indicator: the success ratio of ``outcomes``.

    Args:
        outcomes: Request results, truthy = good request. Any iterable works
            (list, tuple, deque, generator); it is consumed in a single pass,
            so it does not need to support ``len()``.

    Returns:
        Good requests divided by total requests. With no data at all (``None``,
        an empty container, or an exhausted iterator) the result is 1.0: no
        observed failures means no budget has been spent.
    """
    # Falsy input (None, empty container) short-circuits before iteration.
    if not outcomes:
        return 1.0
    good = 0
    total = 0
    for ok in outcomes:
        total += 1
        if ok:
            good += 1
    # An iterator is always truthy, so emptiness is only known after the loop.
    if total == 0:
        return 1.0
    return good / total


def bad_fraction(outcomes) -> float:
    """Return the share of requests in ``outcomes`` that were bad (1 minus the SLI)."""
    good_ratio = sli(outcomes)
    return 1.0 - good_ratio


def burn_rate(slo: SLO, outcomes) -> float:
    """Return how fast the error budget is being spent, as a multiple of the allowed pace.

    1.0 means failures arrive exactly at the rate the SLO permits; 10.0 means
    ten times too fast. With a zero budget the result is ``inf`` as soon as any
    request is bad, and 0.0 otherwise.
    """
    budget = slo.error_budget
    observed_bad = bad_fraction(outcomes)
    if budget != 0:
        return observed_bad / budget
    # Zero budget: division is impossible, so any failure is an unbounded burn.
    if observed_bad > 0:
        return float("inf")
    return 0.0


def budget_remaining(slo: SLO, outcomes) -> float:
    """Return the unspent share of the error budget.

    1.0 = untouched, 0 = exhausted, below 0 = SLO breached. With a zero budget
    the result is 1.0 while no request is bad and -1.0 once any is.
    """
    budget = slo.error_budget
    spent = bad_fraction(outcomes)
    if budget != 0:
        return (budget - spent) / budget
    # Zero budget: nothing to divide by, so report untouched or breached.
    if spent == 0:
        return 1.0
    return -1.0


# Google-SRE-style multi-window burn-rate thresholds (simplified, but the real shape).
# Fast: a short window burning very hot -> the budget is gone in hours -> PAGE.
# Slow: a long window burning sustained -> a real problem, but not a 2am page.
# All three are burn-rate multiples as returned by burn_rate() (1.0 = exactly on budget).
FAST_PAGE = 14.4   # ~2% of a 30-day budget in 1 hour
# Slow-window burn at or above this pages as sev2; at 6x a 30-day budget lasts about 5 days.
SLOW_PAGE = 6.0
# Slow-window burn at or above this opens a sev3 ticket. alert() also requires the slow
# window to reach this level before a fast burn is allowed to page.
SLOW_TICKET = 1.0


def alert(slo: SLO, fast_window, slow_window):
    """Evaluate the two-window burn-rate alert and return ``(action, severity, reason)``.

    ``action`` is "page" (wake a human), "ticket" (queue it) or "ok" (within
    budget); ``severity`` is a SEVERITY key, or None when the action is "ok".
    ``fast_window`` and ``slow_window`` are recent outcome lists for the short
    and the long look-back respectively.
    """
    short_burn = burn_rate(slo, fast_window)
    long_burn = burn_rate(slo, slow_window)
    long_is_burning = long_burn >= SLOW_TICKET

    # A hot short window only pages when the long window confirms it.
    if long_is_burning and short_burn >= FAST_PAGE:
        reason = f"fast burn {short_burn:.1f}x (error budget gone in hours)"
        return ("page", "sev1", reason)

    if long_burn >= SLOW_PAGE:
        reason = f"sustained burn {long_burn:.1f}x"
        return ("page", "sev2", reason)

    if long_is_burning:
        reason = f"slow burn {long_burn:.1f}x (chips away at the budget)"
        return ("ticket", "sev3", reason)

    reason = f"within budget (burn {long_burn:.2f}x)"
    return ("ok", None, reason)


@dataclass
class Incident:
    """The record opened when an alert pages someone. Its timeline feeds the postmortem.

    Attributes:
        id: Incident identifier.
        severity: Severity code (a SEVERITY key such as "sev1").
        title: Short human-readable summary.
        detected_step: Step at which the incident was detected.
        status: Lifecycle state; only ever moves open -> mitigated -> resolved.
        cause: Root cause, filled in by ``resolve``.
        timeline: ``(step, note)`` tuples in the order they were logged.
        mitigations: Mitigation actions in the order they were applied.
    """
    id: str
    severity: str
    title: str
    detected_step: int
    status: str = "open"             # open -> mitigated -> resolved
    cause: str = ""
    timeline: list = field(default_factory=list)
    mitigations: list = field(default_factory=list)

    def log(self, step: int, note: str) -> None:
        """Append a ``(step, note)`` entry to the timeline."""
        self.timeline.append((step, note))

    def mitigate(self, step: int, action: str) -> None:
        """Record a mitigation and mark the incident mitigated.

        The action is always added to ``mitigations`` and the timeline. The
        status is left alone if the incident is already resolved, so a
        late or backfilled mitigation cannot move the lifecycle backwards.
        """
        if self.status != "resolved":
            self.status = "mitigated"
        self.mitigations.append(action)
        self.log(step, f"MITIGATE: {action}")

    def resolve(self, step: int, cause: str) -> None:
        """Mark the incident resolved, store the root cause and log it."""
        self.status = "resolved"
        self.cause = cause
        self.log(step, f"RESOLVED: root cause = {cause}")

    @property
    def duration_steps(self) -> int:
        """Steps from detection to the latest logged step (0 if nothing is logged).

        Uses the highest step in the timeline rather than the most recently
        appended entry, so notes backfilled out of order do not shrink the
        duration. The result is never negative.
        """
        if not self.timeline:
            return 0
        latest_step = max(step for step, _note in self.timeline)
        return max(0, latest_step - self.detected_step)


def write_postmortem(inc: Incident) -> str:
    """Render a blameless postmortem for ``inc`` as a Markdown string.

    Blameless = blame the system, not the person. The document has a header
    with the incident facts, the timeline in logged order, the mitigations
    applied, and a fixed checklist of follow-up action items.
    """
    # Header: title plus the incident facts as a bullet list.
    report = [
        f"# Postmortem: {inc.title}",
        "",
        f"- **Incident:** {inc.id}",
        f"- **Severity:** {inc.severity} ({SEVERITY.get(inc.severity, '')})",
        f"- **Status:** {inc.status}",
        f"- **Duration:** {inc.duration_steps} steps (detection to resolution)",
        f"- **Root cause:** {inc.cause or 'unknown'}",
        "",
        "## Timeline",
    ]
    report.extend(f"- step {step}: {note}" for step, note in inc.timeline)

    report += ["", "## What we did to mitigate"]
    report.extend(f"- {action}" for action in inc.mitigations)

    # The action items are the same for every incident.
    report += [
        "",
        "## Action items (blameless: fix the system, not the person)",
        "- [ ] Add a regression eval for this failure (see `to_eval_case`, ties to M26).",
        "- [ ] Add/update the runbook step that mitigated it fastest.",
        "- [ ] Add an alert if this failure had no SLO coverage.",
    ]
    return "\n".join(report)


def to_eval_case(inc: Incident) -> dict:
    """Convert an incident into a regression eval case (the shape M26's eval gate consumes).

    Every incident becomes a test, so the same failure cannot ship twice unnoticed.
    The case copies the incident's severity, title and root cause and tags its
    origin as "incident".
    """
    case = {"id": f"regression-{inc.id}", "origin": "incident"}
    case["severity"] = inc.severity
    case["scenario"] = inc.title
    case["root_cause"] = inc.cause
    case["expect"] = "service stays within SLO / degrades safely under this condition"
    return case