From 7f01b3750e3de2f9b8ed9fea5472acf02e55033d Mon Sep 17 00:00:00 2001 From: Apoorva Deep Singh <93306950+apoorva5ingh@users.noreply.github.com> Date: Mon, 14 Jul 2025 03:54:30 +0530 Subject: [PATCH] Update gpt.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I created a mini GPT model from scratch using PyTorch, inspired by Karpathy’s educational examples. This project implements all core components of a transformer: multi-head self-attention, feedforward layers, embeddings, and layer normalization. The model is trained on character-level text data and can generate new sequences after training. It includes logic for evaluation, loss tracking, and saving/loading the model. The code is clean and modular, making it perfect for learning how GPT models work internally. This setup is great for experimenting with custom datasets or building lightweight LLMs for small-scale tasks and educational purposes. --- gpt.py | 134 +++++++++++++++++++++------------------------------------ 1 file changed, 50 insertions(+), 84 deletions(-) diff --git a/gpt.py b/gpt.py index e4fc68d6..502a9a78 100644 --- a/gpt.py +++ b/gpt.py @@ -2,9 +2,9 @@ import torch.nn as nn from torch.nn import functional as F -# hyperparameters -batch_size = 64 # how many independent sequences will we process in parallel? -block_size = 256 # what is the maximum context length for predictions? + +batch_size = 64 +block_size = 256 max_iters = 5000 eval_interval = 500 learning_rate = 3e-4 @@ -14,38 +14,34 @@ n_head = 6 n_layer = 6 dropout = 0.2 -# ------------ + torch.manual_seed(1337) -# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -with open('input.txt', 'r', encoding='utf-8') as f: + +input_file = 'input.txt' +assert os.path.exists(input_file), f"File {input_file} not found!" +with open(input_file, 'r', encoding='utf-8') as f: text = f.read() -# here are all the unique characters that occur in this text chars = sorted(list(set(text))) vocab_size = len(chars) -# create a mapping from characters to integers stoi = { ch:i for i,ch in enumerate(chars) } itos = { i:ch for i,ch in enumerate(chars) } -encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers -decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string +encode = lambda s: [stoi[c] for c in s] +decode = lambda l: ''.join([itos[i] for i in l]) -# Train and test splits data = torch.tensor(encode(text), dtype=torch.long) -n = int(0.9*len(data)) # first 90% will be train, rest val +n = int(0.9 * len(data)) train_data = data[:n] val_data = data[n:] -# data loading def get_batch(split): - # generate a small batch of data of inputs x and targets y data = train_data if split == 'train' else val_data ix = torch.randint(len(data) - block_size, (batch_size,)) x = torch.stack([data[i:i+block_size] for i in ix]) y = torch.stack([data[i+1:i+block_size+1] for i in ix]) - x, y = x.to(device), y.to(device) - return x, y + return x.to(device), y.to(device) @torch.no_grad() def estimate_loss(): @@ -62,50 +58,37 @@ def estimate_loss(): return out class Head(nn.Module): - """ one head of self-attention """ - def __init__(self, head_size): super().__init__() self.key = nn.Linear(n_embd, head_size, bias=False) self.query = nn.Linear(n_embd, head_size, bias=False) self.value = nn.Linear(n_embd, head_size, bias=False) self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size))) - self.dropout = nn.Dropout(dropout) def forward(self, x): - # input of size (batch, time-step, channels) - # output of size (batch, time-step, head size) - B,T,C = x.shape - k = self.key(x) # (B,T,hs) - q = self.query(x) # (B,T,hs) - # compute attention scores ("affinities") - wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T) - wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T) - wei = F.softmax(wei, dim=-1) # (B, T, T) + B, T, C = x.shape + k = self.key(x) + q = self.query(x) + wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 + wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) + wei = F.softmax(wei, dim=-1) wei = self.dropout(wei) - # perform the weighted aggregation of the values - v = self.value(x) # (B,T,hs) - out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs) - return out + v = self.value(x) + return wei @ v class MultiHeadAttention(nn.Module): - """ multiple heads of self-attention in parallel """ - def __init__(self, num_heads, head_size): super().__init__() self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)]) - self.proj = nn.Linear(head_size * num_heads, n_embd) + self.proj = nn.Linear(num_heads * head_size, n_embd) self.dropout = nn.Dropout(dropout) def forward(self, x): out = torch.cat([h(x) for h in self.heads], dim=-1) - out = self.dropout(self.proj(out)) - return out + return self.dropout(self.proj(out)) class FeedFoward(nn.Module): - """ a simple linear layer followed by a non-linearity """ - def __init__(self, n_embd): super().__init__() self.net = nn.Sequential( @@ -119,10 +102,7 @@ def forward(self, x): return self.net(x) class Block(nn.Module): - """ Transformer block: communication followed by computation """ - def __init__(self, n_embd, n_head): - # n_embd: embedding dimension, n_head: the number of heads we'd like super().__init__() head_size = n_embd // n_head self.sa = MultiHeadAttention(n_head, head_size) @@ -136,17 +116,13 @@ def forward(self, x): return x class GPTLanguageModel(nn.Module): - def __init__(self): super().__init__() - # each token directly reads off the logits for the next token from a lookup table self.token_embedding_table = nn.Embedding(vocab_size, n_embd) self.position_embedding_table = nn.Embedding(block_size, n_embd) - self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)]) - self.ln_f = nn.LayerNorm(n_embd) # final layer norm + self.blocks = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)]) + self.ln_f = nn.LayerNorm(n_embd) self.lm_head = nn.Linear(n_embd, vocab_size) - - # better init, not covered in the original GPT video, but important, will cover in followup video self.apply(self._init_weights) def _init_weights(self, module): @@ -159,18 +135,15 @@ def _init_weights(self, module): def forward(self, idx, targets=None): B, T = idx.shape - - # idx and targets are both (B,T) tensor of integers - tok_emb = self.token_embedding_table(idx) # (B,T,C) - pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C) - x = tok_emb + pos_emb # (B,T,C) - x = self.blocks(x) # (B,T,C) - x = self.ln_f(x) # (B,T,C) - logits = self.lm_head(x) # (B,T,vocab_size) - - if targets is None: - loss = None - else: + tok_emb = self.token_embedding_table(idx) + pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device)) + x = tok_emb + pos_emb + x = self.blocks(x) + x = self.ln_f(x) + logits = self.lm_head(x) + + loss = None + if targets is not None: B, T, C = logits.shape logits = logits.view(B*T, C) targets = targets.view(B*T) @@ -178,48 +151,41 @@ def forward(self, idx, targets=None): return logits, loss + @torch.no_grad() def generate(self, idx, max_new_tokens): - # idx is (B, T) array of indices in the current context for _ in range(max_new_tokens): - # crop idx to the last block_size tokens idx_cond = idx[:, -block_size:] - # get the predictions - logits, loss = self(idx_cond) - # focus only on the last time step - logits = logits[:, -1, :] # becomes (B, C) - # apply softmax to get probabilities - probs = F.softmax(logits, dim=-1) # (B, C) - # sample from the distribution - idx_next = torch.multinomial(probs, num_samples=1) # (B, 1) - # append sampled index to the running sequence - idx = torch.cat((idx, idx_next), dim=1) # (B, T+1) + logits, _ = self(idx_cond) + logits = logits[:, -1, :] + probs = F.softmax(logits, dim=-1) + idx_next = torch.multinomial(probs, num_samples=1) + idx = torch.cat((idx, idx_next), dim=1) return idx model = GPTLanguageModel() -m = model.to(device) -# print the number of parameters in the model -print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters') +model.to(device) +print(f"{sum(p.numel() for p in model.parameters())/1e6:.2f}M parameters") -# create a PyTorch optimizer optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate) for iter in range(max_iters): - - # every once in a while evaluate the loss on train and val sets if iter % eval_interval == 0 or iter == max_iters - 1: losses = estimate_loss() print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}") - # sample a batch of data xb, yb = get_batch('train') - - # evaluate the loss logits, loss = model(xb, yb) optimizer.zero_grad(set_to_none=True) loss.backward() optimizer.step() -# generate from the model + +torch.save(model.state_dict(), "mini_gpt.pth") + + +torch.manual_seed(42) context = torch.zeros((1, 1), dtype=torch.long, device=device) -print(decode(m.generate(context, max_new_tokens=500)[0].tolist())) -#open('more.txt', 'w').write(decode(m.generate(context, max_new_tokens=10000)[0].tolist())) +model.eval() +with torch.no_grad(): + generated = model.generate(context, max_new_tokens=500) + print(decode(generated[0].tolist()))