"""tiny_lm.py, M17: a language model trained FROM SCRATCH, in ~50 lines of numpy.

This is the whole idea of M0/M15 made real and tiny: a model that predicts the next
character. You TRAIN its weights by gradient descent (watch the loss drop), then sample
from it to GENERATE text. No transformer yet (that's nanogpt_mini.py), just the core loop:
predict next token -> measure error -> nudge the weights -> repeat.

Run:  python tiny_lm.py
"""

import numpy as np

np.random.seed(0)                                 # fixed seed -> same init and same samples every run

# ---- 1. Data + tokenizer (characters are our "tokens") ----------------------
TEXT = "hello world. " * 200                      # small, repetitive corpus so there is a pattern to learn
chars = sorted(set(TEXT))                         # vocabulary: the distinct characters, in sorted order
V = len(chars)                                    # vocabulary size
stoi = dict(zip(chars, range(V)))                 # char -> id
itos = dict(enumerate(chars))                     # id -> char
ids = np.array([stoi[ch] for ch in TEXT])         # the corpus encoded as integer ids
x_ids = ids[:-1]                                  # inputs: every char except the last
y_ids = ids[1:]                                   # targets: the char that follows each input

# ---- 2. The model: one weight matrix (a "bigram neural net") ----------------
# Row W[i] holds the logits (unnormalised scores) for every possible next char given
# current char i; indexing W[i] gives the same result as onehot(i) @ W.
W = 0.01 * np.random.standard_normal((V, V))      # small random init; these are the parameters we train

def softmax(z, axis=-1):
    """Turn logits into probabilities that sum to 1 along `axis`.

    Args:
        z: Array-like of logits with at least one dimension.
        axis: Axis to normalise over. The default (-1, the last axis) gives
            row-wise softmax for a 2-D batch and also works for a single
            1-D vector of logits.

    Returns:
        Array of the same shape as `z` whose entries along `axis` are
        non-negative and sum to 1.
    """
    z = np.asarray(z)
    # Subtracting the max leaves the result unchanged (softmax is shift-invariant)
    # but keeps exp() from overflowing on large logits.
    shifted = z - z.max(axis=axis, keepdims=True)
    e = np.exp(shifted)
    return e / e.sum(axis=axis, keepdims=True)

# ---- 3. Train by gradient descent (the actual "training") -------------------
def loss_now():
    """Return the mean cross-entropy of the current W over every (x_ids, y_ids) pair."""
    next_char_probs = softmax(W[x_ids])            # one row of P(next char) per position
    rows = np.arange(len(y_ids))
    p_correct = next_char_probs[rows, y_ids]       # probability given to the true next char
    # The 1e-9 keeps log() finite should a probability underflow to 0.
    return -np.mean(np.log(p_correct + 1e-9))

print(f"vocab = {V} chars; loss before training = {loss_now():.3f}")
lr = 1.0
n_pairs = len(y_ids)                               # number of (current, next) training pairs
rows = np.arange(n_pairs)                          # row index of each pair; same on every step
for step in range(300):
    # Forward pass: predicted next-char distribution for every position.
    probs = softmax(W[x_ids])
    # Backward pass: d(cross-entropy)/d(logits) = probs - onehot(target).
    probs[rows, y_ids] -= 1
    # Row i of W was used by every position whose current char is i, so sum those
    # positions' logit gradients into row i; dividing by n_pairs averages the loss.
    grad = np.zeros_like(W)
    np.add.at(grad, x_ids, probs / n_pairs)
    # Gradient-descent step.
    W -= lr * grad
print(f"loss after training      = {loss_now():.3f}  (lower = better next-char prediction)")

# ---- 4. Generate by sampling the next char, over and over -------------------
def generate(start="h", n=40):
    """Sample `n` characters from the model, continuing the prompt `start`.

    Args:
        start: Prompt text. The model is a bigram model, so only the last
            character of the prompt conditions the first sampled character.
        n: Number of characters to sample.

    Returns:
        `start` followed by the `n` sampled characters.

    Raises:
        KeyError: If `start` is empty or its last character is not in the
            vocabulary.
    """
    # start[-1:] is the last character of the prompt; for an empty prompt it is "",
    # which is not in stoi and so raises KeyError just like any unknown character.
    cur = stoi[start[-1:]]
    pieces = [start]
    for _ in range(n):
        p = softmax(W[cur:cur + 1])[0]             # P(next char | current char)
        cur = np.random.choice(V, p=p)             # sample the next char from the model
        pieces.append(itos[cur])
    return "".join(pieces)

# Draw one sample from the trained model and show it, then the closing summary.
sample = generate()
print("generated:", repr(sample))
print("\nThat's a language model: predict the next token, trained to reduce error, then sampled.")
print("A real LLM is this idea with a transformer, billions of weights, and the whole internet.")