"""describe_image.py, M12: give the model EYES (image understanding / vision).

So far you've sent text. Many models are *multimodal*: they also accept images. You send
a picture plus a question, and get a text answer: "what's in this photo?", "read the text on
this receipt", "is this chart going up or down?". Same Messages API, one new content block.

Run (venv active, key in .env, from this folder):
    python describe_image.py sample.png "What colour is this image?"
"""

import os
import sys
import base64
from dotenv import load_dotenv
import anthropic

# Load variables from .env first (the module docstring expects the API key
# there), so they are in the environment before the client is created.
load_dotenv()
client = anthropic.Anthropic()

# A multimodal model (accepts images).
MODEL = "claude-opus-4-8"

# Supported image types: lower-case file extension -> media type string that
# goes into the image block of the request.
MEDIA = {
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".gif": "image/gif",
    ".webp": "image/webp",
}


def describe(image_path: str, question: str) -> str:
    """Send an image + a question to the model; return the text answer.

    Args:
        image_path: Path to a local image file. The media type is chosen from
            the file extension (case-insensitive) via the MEDIA table.
        question: The text prompt sent alongside the image.

    Returns:
        The model's answer: the text of every text block in the reply, joined
        together (an empty string if the reply contains no text block). If the
        extension is not in MEDIA, an "Unsupported image type" message is
        returned instead; the file is not read and no API call is made.

    Raises:
        OSError: If the image file cannot be opened or read (for example, it
            does not exist).

    Errors raised by the API client are not caught here and propagate to the
    caller.
    """
    ext = os.path.splitext(image_path)[1].lower()
    media_type = MEDIA.get(ext)
    if not media_type:
        return f"Unsupported image type '{ext}'. Use PNG, JPG, GIF, or WEBP."

    # Read the image and base64-encode it (turn bytes into safe text to send).
    with open(image_path, "rb") as f:
        image_data = base64.standard_b64encode(f.read()).decode("utf-8")

    response = client.messages.create(
        model=MODEL,
        max_tokens=500,
        messages=[{
            "role": "user",
            "content": [
                # The image block: the only new idea vs a text-only call.
                {"type": "image",
                 "source": {"type": "base64", "media_type": media_type, "data": image_data}},
                # ...and your question about it.
                {"type": "text", "text": question},
            ],
        }],
    )

    # The reply is a list of content blocks. Don't assume the first one exists
    # or is text: collect the text of every text block instead of indexing [0].
    return "".join(block.text for block in response.content if block.type == "text")


if __name__ == "__main__":
    # Optional positional arguments: [image path] [question].
    # Anything missing falls back to a default.
    cli_args = sys.argv[1:]
    path = cli_args[0] if cli_args else "sample.png"
    if len(cli_args) > 1:
        question = cli_args[1]
    else:
        question = "Describe this image in one sentence."

    # Echo the inputs, then print the model's answer.
    print(f"Image: {path}\nQ: {question}\n")
    answer = describe(path, question)
    print("A:", answer)