Andrej Karpathy

NanoGPT - minGPT with Teeth

Andrej Karpathy Video

Code

Pulling the dataset we will be working on:

curl https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -o input.txt

Reading it into Python:

with open('input.txt', 'r', encoding='utf-8') as f:
  text = f.read()

Data inspection

print("length of dataset in characters: ", len(text))
print("length of data: ", len(data))
print(text[:1000])
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)

Tokeniser

stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s]          # string -> list of ints
decode = lambda l: ''.join([itos[i] for i in l]) # list of ints -> string

print(encode("hello world"))
print(decode(encode("hello world")))

import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]
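
The encoder above is character-level: the vocabulary stays tiny, but the encoded sequences are long. Subword tokenizers make the opposite trade-off; a minimal sketch for comparison, assuming the tiktoken package is installed:

import tiktoken

enc = tiktoken.get_encoding("gpt2")           # the BPE tokenizer used by GPT-2
print(enc.n_vocab)                            # 50257 tokens vs. the ~65 characters printed above
print(enc.encode("hello world"))              # far fewer ids than the char-level encode
print(enc.decode(enc.encode("hello world")))  # round-trips back to the original string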

Understanding how the context influences the (n+1)th token

block_size = 8
print(train_data[:block_size])
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"at input {context}\n" +
            f"target {target}")

Note that a block of size 8 contains 8 training examples in total, one for each context length from 1 to 8.
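
The video's next step is to sample many such chunks in parallel as a batch; a minimal sketch of that idea (the batch_size of 4 and the manual seed are choices made here, not values from these notes):

torch.manual_seed(1337)
batch_size = 4  # number of independent sequences processed in parallel

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))  # random starting offsets
    x = torch.stack([data[i:i+block_size] for i in ix])        # inputs
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])    # targets, shifted by one
    return x, y

xb, yb = get_batch('train')
print(xb.shape, yb.shape)  # both torch.Size([4, 8])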


Micrograd

Code

import math

class Value:
  def __init__(self, data, _children=(), _op=''):
    self.data = data
    self.grad = 0.0
    self._backward = lambda: None
    self._prev = set(_children)
    self._op = _op

  def __repr__(self):
    return f"Value(data={self.data}, grad={self.grad})"

  def __add__(self, other):
    other = other if isinstance(other, Value) else Value(other)
    out = Value(self.data + other.data, (self, other), '+')

    def _backward():
        self.grad += 1.0 * out.grad
        other.grad += 1.0 * out.grad
    out._backward = _backward
    return out

  def __radd__(self, other):
    return self + other

  def __mul__(self, other):
    other = other if isinstance(other, Value) else Value(other)
    out = Value(self.data * other.data, (self, other), '*')

    def _backward():
        self.grad += other.data * out.grad
        other.grad += self.data * out.grad
    out._backward = _backward
    return out

  def __rmul__(self, other): # karpathy knows too much
    return self * other

  def __truediv__(self, other):
    return self * other**-1

  def __sub__(self, other):
    return self + (other * -1) #LOL

  def __pow__(self, other):
    assert isinstance(other, (int, float))
    out = Value(self.data ** other, (self, ), f'**{other}')

    def _backward():
        self.grad += other * self.data**(other-1) * out.grad
    out._backward = _backward
    return out

  def tanh(self):
    x = self.data
    t = (math.exp(2*x)-1)/(math.exp(2*x)+1)
    out = Value(t, (self, ), 'tanh')
    def _backward():
        self.grad += (1 - t**2) * out.grad
    out._backward = _backward
    return out

  def exp(self):
    x = self.data
    out = Value(math.exp(x), (self, ), 'exp')
    def _backward():
        self.grad += out.data * out.grad
    out._backward = _backward
    return out

  def backward(self):
    """implements topological sort and calls the _backward method in a reversed order"""
    topo = []
    visited = set()
    def build_topo(v):
        if v not in visited:
            visited.add(v)
            for child in v._prev:
              build_topo(child)
            topo.append(v)
    build_topo(self)

    self.grad = 1.0
    for node in reversed(topo):
        node._backward()
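
A quick sanity check of the autograd machinery above (the values are arbitrary):

a = Value(2.0)
b = Value(-3.0)
c = a * b + a**2   # -6 + 4 = -2
c.backward()
print(c)        # Value(data=-2.0, grad=1.0)
print(a.grad)   # dc/da = b + 2a = -3 + 4 = 1.0
print(b.grad)   # dc/db = a = 2.0
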
import random

class Neuron:
  def __init__(self, nin):
    # nin = number of inputs
    self.w = [Value(random.uniform(-1,1)) for _ in range(nin)]
    self.b = Value(random.uniform(-1,1))

  def parameters(self):
    return self.w + [self.b]

  def __call__(self, x):
    # this is wild; you can call an instance of Neuron as n(x) :O
    # x are the inputs, we are taking a dot product:
    act = sum((wi*xi for wi, xi in zip(self.w, x)), self.b) # sum can take an optional starting value: self.b
    out = act.tanh()
    return out
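
Since __call__ is defined, a Neuron instance really can be used like a function (the inputs below are arbitrary):

neuron = Neuron(2)
print(neuron([1.0, -2.0]))  # a single Value, squashed into (-1, 1) by the tanh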

class Layer:
  def __init__(self, nin, nout):
    self.neurons = [Neuron(nin) for _ in range(nout)]

  def __call__(self, x):
    outs = [n(x) for n in self.neurons]
    return outs[0] if len(outs) == 1 else outs

  def parameters(self):
    return [p for neuron in self.neurons for p in neuron.parameters()]

class MLP:
  def __init__(self, nin, nouts):
    sz = [nin] + nouts
    self.layers = [Layer(sz[i], sz[i+1]) for i in range(len(nouts))]

  def __call__(self, x):
    for layer in self.layers:
        x = layer(x)
    return x

  def parameters(self):
    return [p for layer in self.layers for p in layer.parameters()]
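
For a sense of scale, the MLP built below (3 inputs, then layers of 4, 4 and 1 neurons) holds 41 scalar parameters; a quick check using the classes above:

m = MLP(3, [4, 4, 1])
print(len(m.parameters()))  # 4*(3+1) + 4*(4+1) + 1*(4+1) = 41
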
n = MLP(3, [4,4,1])
xs = [
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0]
    ]
ys = [1.0, -1.0, -1.0, 1.0]
for k in range(20):
  ypred = [n(x) for x in xs]
  loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred))

  for p in n.parameters(): # without this, you accumulate the gradients and create an artificial momentum.
    p.grad = 0.0
  loss.backward()

  for p in n.parameters():
    p.data += -0.01 * p.grad
  print(k, loss.data)
print(ypred)  # predictions should move toward the targets [1.0, -1.0, -1.0, 1.0]

References

https://www.youtube.com/watch?v=VMj-3S1tku0&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ
