"""prepare_dataset.py, M15: build and validate a fine-tuning dataset.

Fine-tuning teaches a model a consistent STYLE / FORMAT / behaviour by showing it many
examples of "input -> ideal output." The dataset is the whole game, and you can build and
check it with no GPU and no key. This makes a chat-format JSONL file (one example per line)
in our brand voice, then validates it.

Run:  python prepare_dataset.py
"""

import json

# System prompt that build() places as the first message of every training row.
SYSTEM = "You are GreenLeaf Café Support: warm, brief (1-2 sentences), solution-first."

# Each example = a customer message and the ideal on-brand reply. Real datasets have
# dozens-to-hundreds of these; consistency of style matters more than quantity.
# Replies carry no leading/trailing whitespace: they are written verbatim as the
# assistant message, so stray whitespace would become part of the training target.
EXAMPLES = [
    ("My order is late.",
     "So sorry for the wait! I've flagged it, you'll have tracking within the hour."),
    ("The app keeps crashing when I pay.",
     "Ugh, that's frustrating, try updating the app, and if it still crashes I'll process your order by hand."),
    ("Do you have oat milk?",
     "We do! Oat is our default milk alternative, and we've got soya too."),
    ("This coffee was cold.",
     "That's on us, pop back in and we'll remake it fresh, no charge."),
    ("What time do you close?",
     "We close at 6 PM today (last hot orders 5:30). See you soon!"),
    ("I want a refund for my sandwich.",
     "Of course, happy to refund or remake it. Which would you prefer?"),
]


def build(path, examples=None, system=None):
    """Write examples as chat-format JSONL (the shape fine-tuning APIs expect).

    Each example becomes one line of the file: a JSON object whose "messages"
    list holds a system, a user, and an assistant message, in that order.

    Args:
        path: Destination file; created, or overwritten if it already exists.
        examples: Iterable of (user_message, ideal_reply) pairs. Defaults to
            the module-level EXAMPLES.
        system: System prompt placed first in every row. Defaults to the
            module-level SYSTEM.

    Returns:
        The number of examples written.
    """
    if examples is None:
        examples = EXAMPLES
    if system is None:
        system = SYSTEM

    written = 0
    with open(path, "w", encoding="utf-8") as f:
        for user_msg, ideal_reply in examples:
            row = {"messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user_msg},
                {"role": "assistant", "content": ideal_reply},
            ]}
            # ensure_ascii=False writes non-ASCII text (e.g. "Café") as-is
            # instead of \uXXXX escapes.
            f.write(json.dumps(row, ensure_ascii=False) + "\n")
            # Count as we go so any iterable works, not only sized sequences.
            written += 1
    return written


def validate(path):
    """Check every line is valid training data; raise on the first problem.

    A line is accepted when it parses as a JSON object whose "messages" value
    is a non-empty list of objects that each carry a "role", with at least one
    "user" message and an "assistant" message last.

    Args:
        path: JSONL file to check, read as UTF-8.

    Returns:
        The number of lines that passed.

    Raises:
        ValueError: On the first bad line; the message starts with that line's
            1-based number. Unparseable JSON is reported as
            json.JSONDecodeError, which is a ValueError subclass.
    """
    count = 0
    # Context manager so the file is closed even when validation raises.
    with open(path, encoding="utf-8") as f:
        for i, line in enumerate(f, 1):
            try:
                obj = json.loads(line)               # must be valid JSON
            except json.JSONDecodeError as err:
                # The decoder only knows positions within this one line, so
                # prefix the file line number; keep the exception type.
                raise json.JSONDecodeError(
                    f"line {i}: invalid JSON ({err.msg})", err.doc, err.pos
                ) from err

            # A JSON array/string/number has no .get(); treat it as missing.
            msgs = obj.get("messages") if isinstance(obj, dict) else None
            if not isinstance(msgs, list) or not msgs:
                raise ValueError(f"line {i}: missing 'messages' list")

            # Guard the role lookup so malformed messages give a ValueError
            # with a line number rather than a bare KeyError/TypeError.
            if not all(isinstance(m, dict) and "role" in m for m in msgs):
                raise ValueError(f"line {i}: every message must be an object with a 'role'")

            roles = [m["role"] for m in msgs]
            if "user" not in roles or roles[-1] != "assistant":
                raise ValueError(f"line {i}: needs a user message and must end with the assistant's reply")
            count += 1
    return count


if __name__ == "__main__":
    # Write the dataset, then re-read the same file to confirm every line is valid.
    written = build("train.jsonl")
    checked = validate("train.jsonl")
    report = (
        f"Wrote {written} examples to train.jsonl and validated {checked} lines. ",
        "This file is ready to upload to a fine-tuning API (see finetune.py).",
        "Tip: more high-quality, *consistent* examples = a better fine-tune.",
    )
    for message in report:
        print(message)
