"""index_ops.py: keep the knowledge base (the M7 vector store) healthy in production.

A RAG index is not build-once-and-forget. Source documents change, so the indexed copy goes STALE;
the store needs BACKUPS you can actually restore; and data has a lifecycle: PII must be redacted on
write, and expired documents must be swept out (retention). This is a tiny in-memory stand-in for a
vector store that makes those operations concrete. Embeddings are stubbed as a hash so it runs
offline; the operations themselves (staleness, reindex, snapshot/restore, retention, redaction)
are the real ones.
"""
import hashlib
import re
import copy


def _hash(text: str) -> str:
    """Return a short, deterministic fingerprint of ``text``.

    The fingerprint is the first 12 hexadecimal characters of the SHA-256 digest of the
    text's default (UTF-8) encoding.
    """
    hasher = hashlib.sha256()
    hasher.update(text.encode())
    full_digest = hasher.hexdigest()
    return full_digest[:12]


# Redact obvious PII before it is ever written to the index (privacy first, M14/M30).
# Each entry is (compiled pattern, replacement placeholder); entries are applied in list order.
PII_PATTERNS = [
    # Email address: local part, "@", then a domain containing at least one dot.
    (re.compile(r"\b[\w.+-]+@[\w-]+\.[\w.-]+\b"), "[email]"),
    # US social security number in its dashed 3-2-4 form.
    (re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), "[ssn]"),
    # 16-digit card number, either contiguous or written as four groups of four joined by ONE
    # consistent separator (space or hyphen). Cards are usually written grouped, so matching
    # only the contiguous form would let the common "4111 1111 1111 1111" spelling through.
    # The backreference keeps the separator consistent to limit false positives.
    (re.compile(r"\b\d{4}(?P<sep>[ -]?)\d{4}(?P=sep)\d{4}(?P=sep)\d{4}\b"), "[card]"),
]


def redact_pii(text: str) -> str:
    """Return ``text`` with every match of each ``PII_PATTERNS`` entry replaced by its placeholder.

    The patterns are applied one after another in list order, each one operating on the
    output of the previous substitution.
    """
    redacted = text
    for rule in PII_PATTERNS:
        regex, placeholder = rule
        redacted = regex.sub(placeholder, redacted)
    return redacted


class Index:
    """A content-addressed stand-in for a vector store.

    Each record remembers the hash of the source it was built from, which is how we later tell
    that the source has changed (gone stale).

    Attributes:
        docs: Mapping of doc_id to a record dict with the keys ``source_hash``, ``embedding``,
            ``indexed_at``, ``expires_at`` and ``text``.
    """

    def __init__(self):
        self.docs = {}     # doc_id -> {source_hash, embedding, indexed_at, expires_at, text}

    def upsert(self, doc_id, text, at, ttl=None, redact=True):
        """Insert or replace the record for ``doc_id``.

        Args:
            doc_id: Key under which the record is stored.
            text: The original source text.
            at: Indexing time, stored as ``indexed_at``.
            ttl: Optional lifetime; when given, ``expires_at`` is ``at + ttl``, otherwise the
                record never expires (``expires_at`` is ``None``).
            redact: When true (the default) the text is passed through ``redact_pii`` before
                it is stored and "embedded".
        """
        clean = redact_pii(text) if redact else text
        self.docs[doc_id] = {
            "source_hash": _hash(text),                 # hash of the ORIGINAL source (change detection)
            "embedding": _hash(clean),                  # stub "embedding" of the stored (clean) text
            "indexed_at": at,
            "expires_at": (at + ttl) if ttl is not None else None,
            "text": clean,
        }

    def is_stale(self, doc_id, current_text) -> bool:
        """Stale = we never indexed it, or the live source changed since we did."""
        rec = self.docs.get(doc_id)
        return rec is None or rec["source_hash"] != _hash(current_text)

    def stale_ids(self, sources) -> list:
        """Return the ids in ``sources`` that need (re)indexing.

        Args:
            sources: ``{doc_id: current_text}``.

        Returns:
            The doc_ids, in ``sources`` order, that are either missing from the index or whose
            stored source hash no longer matches the current text.
        """
        return [doc_id for doc_id, text in sources.items() if self.is_stale(doc_id, text)]

    def reindex(self, sources, at) -> list:
        """Re-embed ONLY the changed docs (re-embedding everything is the expensive mistake).

        A document that already had a retention deadline keeps it: re-embedding refreshes the
        content and ``indexed_at`` but must not silently turn an expiring record into a
        permanent one. Documents that were not indexed before are stored without an expiry.
        Re-embedded text always goes through the default redaction of ``upsert``.

        Args:
            sources: ``{doc_id: current_text}``.
            at: Indexing time recorded on every re-embedded record.

        Returns:
            The list of doc_ids that were (re)indexed.
        """
        changed = self.stale_ids(sources)
        for doc_id in changed:
            previous = self.docs.get(doc_id)
            self.upsert(doc_id, sources[doc_id], at)
            if previous is not None:
                # upsert() without a ttl resets expires_at to None; restore the old deadline
                # so sweep_expired() still removes the document on schedule.
                self.docs[doc_id]["expires_at"] = previous["expires_at"]
        return changed

    def sweep_expired(self, now) -> list:
        """Retention: drop docs whose TTL has passed.

        A record is expired when it has an ``expires_at`` and ``expires_at <= now``.

        Returns:
            The list of doc_ids that were removed.
        """
        # Collect first, delete second: a dict must not change size while being iterated.
        expired = [doc_id for doc_id, rec in self.docs.items()
                   if rec["expires_at"] is not None and rec["expires_at"] <= now]
        for doc_id in expired:
            del self.docs[doc_id]
        return expired

    def snapshot(self):
        """A backup is just a deep copy you can restore. (A backup you never restore is a hope.)"""
        return copy.deepcopy(self.docs)

    def restore(self, snap):
        """Replace the index contents with a deep copy of ``snap``.

        Copying keeps the snapshot reusable: later changes to the index do not leak into it.
        """
        self.docs = copy.deepcopy(snap)

    def __len__(self):
        """Return the number of indexed documents."""
        return len(self.docs)
