"""reliability.py: the patterns that keep an agent working when things go wrong.

A demo agent calls the model once and assumes it works. A production agent assumes the call WILL
sometimes fail: the API rate-limits you, times out, or returns a 5xx, and an agent can also loop
forever and burn money. These are small, reusable patterns for each of those failures. They have
no dependencies and run offline.

Patterns:
  retry            : try again on a transient error, waiting longer each time (exponential backoff)
  call_with_deadline: give up on a call that takes too long (timeout)
  fallback         : if the first option fails, try the next (e.g. a simpler path or a safe reply)
  StepLimiter      : cap how many steps an agent may take, so a loop cannot run away
  approval_gate    : require a human yes before a risky, world-changing action (human-in-the-loop)
"""

import time
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeout


class TransientError(Exception):
    """Signals a temporary failure (rate limit, timeout, short-lived 5xx) that is worth retrying.

    It is the default entry in retry()'s `retry_on` tuple, and it is what call_with_deadline
    raises when the deadline passes.
    """


class StepLimitExceeded(Exception):
    """Raised by StepLimiter.tick() once the step count goes past the cap (a runaway loop)."""


class ApprovalDenied(Exception):
    """Raised by approval_gate() when the approver does not say yes to a risky action.

    The exception message is the description of the action that was refused.
    """


def retry(fn, attempts=3, base_delay=0.5, sleep=time.sleep, retry_on=(TransientError,)):
    """Call fn; on a retryable error wait base_delay, then 2x, 4x, ... and try again.

    Args:
        fn: zero-argument callable to invoke.
        attempts: total number of calls allowed (first try included); must be >= 1.
        base_delay: wait before the second call, in the unit `sleep` expects; doubles each retry.
        sleep: function used to wait; injectable so tests can run without real delays.
        retry_on: exception class, or tuple of classes, that triggers another attempt.

    Returns:
        Whatever fn returns on its first successful call.

    Raises:
        ValueError: if attempts is less than 1 (fn would never be called).
        Exception: the retryable error from the final attempt once attempts are used up;
            any error not matched by retry_on propagates immediately without a retry.
    """
    if attempts < 1:
        # Without this guard the loop body never runs and there is no error to re-raise.
        raise ValueError(f"attempts must be at least 1, got {attempts!r}")
    for attempt in range(1, attempts + 1):
        try:
            return fn()
        except retry_on:
            if attempt == attempts:
                raise                               # out of attempts: surface the last failure as-is
            sleep(base_delay * (2 ** (attempt - 1)))  # exponential backoff: 0.5s, 1s, 2s, ...


def call_with_deadline(fn, seconds):
    """Run fn but give up after `seconds`. Raises TransientError on timeout (so retry can catch it).

    Args:
        fn: zero-argument callable, run on a single worker thread.
        seconds: how long the caller is willing to wait for fn's result.

    Returns:
        fn's return value when it finishes in time. An exception raised by fn propagates.

    Raises:
        TransientError: if no result is available within `seconds`.

    Note: Python cannot kill the worker thread, so a timed-out call keeps running in the background;
    in production you also set a client-side request timeout. This models the caller giving up.
    """
    # Deliberately not `with ThreadPoolExecutor(...)`: leaving the with-block calls
    # shutdown(wait=True), which would block until fn finishes and defeat the deadline.
    executor = ThreadPoolExecutor(max_workers=1)
    try:
        future = executor.submit(fn)
        try:
            return future.result(timeout=seconds)
        except FutureTimeout as exc:
            raise TransientError(f"call timed out after {seconds}s") from exc
    finally:
        # Release the executor without waiting for a call that may still be running.
        executor.shutdown(wait=False)


def fallback(*options):
    """Try each option (a zero-arg function) in order; return the first that succeeds.

    Args:
        *options: zero-argument callables, most preferred first. Later options are not called
            once one has returned.

    Returns:
        The return value of the first option that does not raise.

    Raises:
        TypeError: if called with no options at all.
        Exception: the error from the last option when every option fails.
    """
    if not options:
        # Previously this fell through to `raise None`, which produced a confusing TypeError.
        raise TypeError("fallback() requires at least one option")
    final_position = len(options)
    for position, option in enumerate(options, start=1):
        try:
            return option()
        except Exception:
            # Broad on purpose: any ordinary failure moves on to the next option.
            # KeyboardInterrupt/SystemExit are not Exception subclasses and still propagate.
            if position == final_position:
                raise


class StepLimiter:
    """Step counter with a hard ceiling; call tick() at the start of every loop iteration.

    Attributes are plain and public:
      max_steps: the highest number of ticks allowed.
      count:     how many times tick() has been called so far.
    """

    def __init__(self, max_steps=8):
        self.count = 0
        self.max_steps = max_steps

    def tick(self):
        """Record one step; raise StepLimitExceeded if that step is beyond max_steps."""
        self.count = self.count + 1
        if self.count <= self.max_steps:
            return
        # The counter has already been bumped, so it also records the refused step.
        raise StepLimitExceeded(f"exceeded {self.max_steps} steps (possible runaway loop)")


def approval_gate(action_description, approver, is_risky=True):
    """Human-in-the-loop check: a risky action needs a yes from `approver` before it may proceed.

    Args:
        action_description: text describing the action; passed to the approver and used as the
            ApprovalDenied message.
        approver: callable taking the description and returning a truthy value to approve.
        is_risky: when False the approver is not consulted and the action passes.

    Returns:
        True when the action may go ahead.

    Raises:
        ApprovalDenied: if the action is risky and the approver's answer is falsy.
    """
    # Short-circuit keeps the approver from being called for safe actions.
    allowed = (not is_risky) or bool(approver(action_description))
    if not allowed:
        raise ApprovalDenied(action_description)
    return True
