image.png

image.png

$W_{ax}$ → weight from x embedding to the hidden layer

$W_{aa}$ → weight from the previous hidden state to the current hidden state

$W_{ya}$ → weight from hidden layer to the output layer

$a^{<n>}$ → activation at time step $n$

$x^{<n>}$ → input at time step $n$

$y^{<n>}$ → output at time step $n$

$a^{<0>}$ = 0

$a^{<1>} = g(W_{aa} \, a^{<0>} + W_{ax} \, x^{<1>} + b_a)$ ← tanh or relu

$y^{<1>}$ = g($W_{ya}$ * $a^{<1>}$ + $b_y$) ← sigmoid

Screenshot 2025-07-15 at 2.43.09 PM.png

topologically sorted to visualize back propagation flow

My implementation + Colab links

class RNN:
  """Character-level vanilla RNN trained with truncated BPTT and Adam.

  Token lookup: `step` and `sample` rely on a module-level `indices` dict
  (char -> vocab id) when given characters; plain integer vocab ids are
  also accepted directly and bypass the table.

  Shapes (h = hs_size, v = vocab_size):
    Whx: (h, v)   Whh: (h, h)   Why: (v, h)   bh: (h, 1)   by: (v, 1)
  """

  # ---- helpers ------------------------------------------------------------

  def softmax(self, x):
    """Numerically stable softmax over all entries of `x`."""
    # Subtracting the max before exponentiating avoids overflow; the
    # result is mathematically unchanged.
    exps = np.exp(x - np.max(x))
    return exps / np.sum(exps)

  def _to_ids(self, tokens):
    """Map characters to vocab ids via the module-level `indices` table.

    Integer ids pass through unchanged, so callers may supply either
    characters or ready-made indices.
    """
    return [t if isinstance(t, (int, np.integer)) else indices[t]
            for t in tokens]

  # ---- construction -------------------------------------------------------

  def __init__(self, vocab_size: int, hs_size: int = 64, lr: float = 0.01):
    """Create a randomly initialised RNN.

    Args:
      vocab_size: number of distinct tokens (one-hot input/output size).
      hs_size: hidden-state dimension.
      lr: Adam learning rate.
    """
    self.hs_size = hs_size
    self.vocab_size = vocab_size
    self.lr = lr

    # Weights, scaled by 1/sqrt(fan_in) to keep early activations in a
    # reasonable range for tanh.
    self.Whx = np.random.randn(hs_size, vocab_size) * np.sqrt(1.0 / self.vocab_size)  # (h, v)
    self.Whh = np.random.randn(hs_size,    hs_size) * np.sqrt(1.0 / self.hs_size)     # (h, h)
    self.Why = np.random.randn(vocab_size, hs_size) * np.sqrt(1.0 / self.hs_size)     # (v, h)

    # Biases. NOTE(review): random init is kept from the original; zeros
    # is the more conventional choice for biases.
    self.bh = np.random.randn(hs_size, 1)       # (h, 1)
    self.by = np.random.randn(vocab_size, 1)    # (v, 1)

    # Adam state, one slot per parameter in the order used by
    # update_params_adam: [Why, Whh, Whx, by, bh].
    # Scalars 0 broadcast against the first gradient arrays.
    self.velocities = [0, 0, 0, 0, 0]   # first-moment (momentum) estimates
    self.G = [0, 0, 0, 0, 0]            # second-moment (squared-grad) estimates
    self.t = 0                          # time step for bias correction

  # ---- training -----------------------------------------------------------

  def step(self, inputs, targets, h_prev):
    """Run one forward+backward pass over a sequence and update weights.

    Args:
      inputs: sequence of tokens (chars or vocab ids) fed to the network.
      targets: sequence of next-token labels, same length as `inputs`.
      h_prev: (h, 1) hidden state carried over from the previous step.

    Returns:
      (loss, h_last): total cross-entropy loss over the sequence and the
      final hidden state. An empty sequence returns (0.0, copy of h_prev)
      without touching the weights.
    """
    inputs = self._to_ids(inputs)
    targets = self._to_ids(targets)

    # Guard: with no tokens there is nothing to train on, and the original
    # `return hs[i]` would have raised NameError.
    if not inputs:
      return 0.0, np.copy(h_prev)

    xs, hs, ys = {}, {}, {}
    hs[-1] = np.copy(h_prev)   # seed so hs[i-1] works at i == 0
    loss = 0.0

    for i, (inp, tgt) in enumerate(zip(inputs, targets)):
      # One-hot encode the current input token.
      x = np.zeros((self.vocab_size, 1))
      x[inp] = 1
      xs[i] = x

      # h_t = tanh(Whx @ x_t + Whh @ h_{t-1} + bh)
      hs[i] = np.tanh(self.Whx @ x + self.Whh @ hs[i - 1] + self.bh)

      # y_t = softmax(Why @ h_t + by)
      ys[i] = self.softmax(self.Why @ hs[i] + self.by)

      # Cross-entropy on the target class; epsilon guards log(0).
      loss += -np.log(ys[i][tgt, 0] + 1e-8)

    self.backward(xs, hs, ys, targets)

    return loss, hs[len(inputs) - 1]

  def backward(self, xs, hs, ys, targets):
    """Backprop through time, clip gradients, and apply an Adam update.

    Args:
      xs, hs, ys: per-step inputs, hidden states, and softmax outputs
        recorded by `step` (hs additionally holds hs[-1]).
      targets: vocab ids of the labels, one per step.
    """
    # Gradient accumulators, one per parameter.
    dWhy = np.zeros_like(self.Why)
    dWhh = np.zeros_like(self.Whh)
    dWhx = np.zeros_like(self.Whx)

    dby = np.zeros_like(self.by)
    dbh = np.zeros_like(self.bh)

    # Gradient flowing back into the hidden state from the future.
    dh_next = np.zeros_like(hs[0])

    for i in reversed(range(len(xs))):
      # softmax + cross-entropy gradient: y - one_hot(target)
      dy = np.copy(ys[i])
      dy[targets[i]] -= 1

      dWhy += dy @ hs[i].T
      dby += dy

      # Hidden-state gradient: from the output at this step plus the
      # contribution propagated back from step i+1.
      dh = self.Why.T @ dy + dh_next
      dh_raw = (1 - hs[i] ** 2) * dh   # tanh'(z) = 1 - tanh(z)^2

      dWhx += dh_raw @ xs[i].T
      dWhh += dh_raw @ hs[i - 1].T
      dbh += dh_raw

      dh_next = self.Whh.T @ dh_raw

    # Element-wise clipping to mitigate exploding gradients.
    for dparam in [dWhx, dWhh, dWhy, dbh, dby]:
      np.clip(dparam, -5, 5, out=dparam)

    self.update_params_adam(dWhy, dWhh, dWhx, dby, dbh)

  # ---- optimiser ----------------------------------------------------------

  def update_params_adam(self, dWhy, dWhh, dWhx, dby, dbh):
    """Apply one Adam update to every parameter in place.

    Gradient order must match the parameter order below (and the slot
    order of self.velocities / self.G).
    """
    self.t += 1
    grads = [dWhy, dWhh, dWhx, dby, dbh]
    params = [self.Why, self.Whh, self.Whx, self.by, self.bh]
    # NOTE(review): heavier smoothing than the usual Adam defaults
    # (0.9 / 0.999) — kept as the author's choice.
    beta1, beta2 = 0.99, 0.9999
    for i in range(len(params)):
      # Exponential moving averages of the gradient and its square.
      self.velocities[i] = self.velocities[i] * beta1 + (1 - beta1) * grads[i]
      self.G[i] = self.G[i] * beta2 + (1 - beta2) * grads[i] ** 2

      # Bias correction for the zero-initialised moment estimates.
      v_cap = self.velocities[i] / (1 - beta1 ** self.t)
      g_cap = self.G[i] / (1 - beta2 ** self.t)

      # In-place update so the arrays in self.* are modified directly.
      params[i] -= self.lr * v_cap / (np.sqrt(g_cap) + 1e-8)

  # ---- inference ----------------------------------------------------------

  def sample(self, length: int = 14, seed_char: str = "~"):
    """Generate a sequence of token ids by ancestral sampling.

    Args:
      length: number of tokens to generate (default 14, as before).
      seed_char: character fed as the first input; looked up in the
        module-level `indices` table (default "~", as before).

    Returns:
      list of sampled vocab ids (ints).
    """
    h = np.zeros((self.hs_size, 1))
    x = np.zeros((self.vocab_size, 1))
    x[indices[seed_char]] = 1

    sequence = []

    for _ in range(length):
      # Forward pass for a single step.
      h = np.tanh(self.Whx @ x + self.Whh @ h + self.bh)
      y = self.softmax(self.Why @ h + self.by)

      # Draw the next token from the output distribution.
      char_idx = np.random.choice(self.vocab_size, p=y.ravel())
      sequence.append(char_idx)

      # Feed the sampled token back in as a one-hot input.
      x = np.zeros((self.vocab_size, 1))
      x[char_idx] = 1

    return sequence