"""guardrails.py, the safety layer around an AI app.

Three small, plain-Python guards (no model needed, you can test them directly):
  - screen_input  : catch prompt-injection / jailbreak attempts BEFORE they reach the model
  - screen_output : never let a protected secret leave, even if the model slips
  - screen_tool   : least privilege, block dangerous actions (excessive agency)

These are a *first line*, not a silver bullet. Real safety is layered (defense in depth):
rules like these + an LLM classifier + least-privilege tools + human approval for risky actions.
"""

import re

# ---------- 1. Input guardrail: prompt injection / jailbreak -----------------
# Each entry is a regex matched against the lower-cased, whitespace-collapsed
# input, so patterns are written in lower case with single spaces between words.
_INJECTION_PATTERNS = [
    # "ignore / disregard the previous instructions"
    r"ignore (the |your |all )?(previous |prior |above )?(instructions|rules|prompt)",
    r"disregard (the |your |all )?(previous |prior )?(instructions|rules)",
    # attempts to extract the system prompt or stored notes
    r"(reveal|show|repeat|print|tell me|give me) (the |your )?(system )?(prompt|instructions|notes?)",
    # persona-style jailbreaks
    r"you are (now )?(dan|in developer mode|unrestricted|jailbroken)",
    r"pretend (you have|there are) no (rules|restrictions|guidelines)",
    # "override ... code" within a short window
    r"\boverride\b.{0,20}\bcode\b",
]


def screen_input(text):
    """Return (allowed, reason). Blocks common prompt-injection / jailbreak phrasings.

    The input is lower-cased and every run of whitespace (spaces, tabs, newlines)
    is collapsed to a single space before matching. The patterns use single
    literal spaces, so without this step a doubled space or a line break between
    two words is enough to slip past the guard.

    Args:
        text: The user-supplied input. ``None`` or an empty string is allowed,
            since there is nothing to screen.

    Returns:
        ``(True, "")`` when no pattern matches, otherwise ``(False, reason)``.
    """
    if not text:
        return (True, "")
    # str.split() with no argument splits on any whitespace run and drops
    # leading/trailing whitespace; re-joining yields single-space separation.
    normalized = " ".join(text.lower().split())
    if any(re.search(pattern, normalized) for pattern in _INJECTION_PATTERNS):
        return (False, "input looks like a prompt-injection / jailbreak attempt")
    return (True, "")


# ---------- 2. Output guardrail: stop secrets leaking ------------------------
def screen_output(text, secret):
    """Return (allowed, reason). Blocks the reply if it contains a protected secret.

    The comparison is a caseless substring check. Surrounding whitespace on the
    secret is ignored so that a secret carrying a trailing newline (for example
    one read from a file) still matches the bare secret in the reply.

    Args:
        text: The model reply to screen. ``None`` is treated as an empty reply.
        secret: The protected value. ``None``, empty, or whitespace-only means
            there is nothing to protect and every reply is allowed.

    Returns:
        ``(True, "")`` when the reply is safe to release, otherwise
        ``(False, reason)``.
    """
    needle = secret.strip() if secret else ""
    if not needle or not text:
        return (True, "")
    # casefold() rather than lower(): it is the Unicode-aware caseless form,
    # so e.g. "straße" and "STRASSE" compare equal.
    if needle.casefold() in text.casefold():
        return (False, "output contained a protected secret")
    return (True, "")


# ---------- 3. Tool guardrail: least privilege (excessive agency) ------------
# Tools that may run automatically.
SAFE_TOOLS = {"lookup_ioc", "search_logs", "calculate", "list_notes", "save_note"}
# Tools that are always refused here, even if they also appear in SAFE_TOOLS.
DANGEROUS_TOOLS = {"send_email", "delete_records", "make_payment", "run_shell"}


def screen_tool(name):
    """Return (allowed, reason). Only allow-listed tools run automatically; dangerous
    actions are refused (in a real app: require explicit human approval).

    The deny-list is checked before the allow-list, so a tool that is listed in
    both sets by mistake is still refused. A name that cannot be looked up at all
    (an unhashable value such as a list) is refused rather than raising.

    Args:
        name: The tool name requested.

    Returns:
        ``(True, "")`` for a tool in ``SAFE_TOOLS`` that is not in
        ``DANGEROUS_TOOLS``; otherwise ``(False, reason)``.
    """
    try:
        is_dangerous = name in DANGEROUS_TOOLS
        is_safe = name in SAFE_TOOLS
    except TypeError:
        # Unhashable name: it cannot be on either list, so fail closed.
        is_dangerous = is_safe = False

    if is_dangerous:
        return (False, f"'{name}' can cause real harm, needs human approval (excessive-agency guard)")
    if is_safe:
        return (True, "")
    return (False, f"'{name}' is not on the allow-list")