

$W_{ax}$ → weight matrix from the input $x$ to the hidden layer
$W_{aa}$ → weight matrix from one hidden state to the next
$W_{ya}$ → weight matrix from the hidden layer to the output layer
$a^{<n>}$ → hidden-state activation at time step $n$
$x^{<n>}$ → input at time step $n$
$y^{<n>}$ → output at time step $n$
$a^{<0>} = 0$ (the initial hidden state is a zero vector)
$a^{<1>} = g(W_{aa} \, a^{<0>} + W_{ax} \, x^{<1>} + b_a)$ ← $g$ is typically tanh or ReLU
$y^{<1>} = g(W_{ya} \, a^{<1>} + b_y)$ ← $g$ is typically sigmoid (the code below uses softmax)
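
As a quick shape check, here is a minimal NumPy sketch of a single forward step using the equations above; the sizes and variable names are made up purely for illustration:

```python
import numpy as np

# toy sizes, chosen only for illustration
input_size, hidden_size, output_size = 10, 4, 10

Wax = np.random.randn(hidden_size, input_size)   # input -> hidden
Waa = np.random.randn(hidden_size, hidden_size)  # hidden -> hidden
Wya = np.random.randn(output_size, hidden_size)  # hidden -> output
ba = np.zeros((hidden_size, 1))
by = np.zeros((output_size, 1))

a_prev = np.zeros((hidden_size, 1))  # a^<0> = 0
x1 = np.zeros((input_size, 1))
x1[3] = 1                            # one-hot input x^<1>

# a^<1> = g(Waa a^<0> + Wax x^<1> + ba), with g = tanh
a1 = np.tanh(Waa @ a_prev + Wax @ x1 + ba)
# y^<1> = g(Wya a^<1> + by), with g = sigmoid here
y1 = 1 / (1 + np.exp(-(Wya @ a1 + by)))
```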

Unrolling the network through time yields a topologically sorted computation graph, which makes the flow of backpropagation through time easy to visualize: gradients run from the final time step back to the first. The character-level RNN below implements this forward pass, backpropagation through time, and Adam updates.
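
For reference, these are the per-step gradients the `backward` method accumulates, written for a softmax output with cross-entropy loss ($\hat{y}$ is the predicted distribution, $\odot$ is element-wise multiplication, and $\delta^{<t>}$ corresponds to `dh_raw` in the code):

$$
\begin{aligned}
\frac{\partial L}{\partial z_y^{<t>}} &= \hat{y}^{<t>} - y^{<t>} \\
\frac{\partial L}{\partial a^{<t>}} &= W_{ya}^\top \left(\hat{y}^{<t>} - y^{<t>}\right) + W_{aa}^\top \delta^{<t+1>} \\
\delta^{<t>} &= \left(1 - (a^{<t>})^2\right) \odot \frac{\partial L}{\partial a^{<t>}}
\end{aligned}
$$

The factor $1 - (a^{<t>})^2$ is the derivative of tanh, and $\delta^{<T+1>} = 0$ at the last time step.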
```python
import numpy as np


class RNN:
    # helper functions
    def softmax(self, x):
        # shift by the max for numerical stability
        x_max = np.max(x)
        exps = np.exp(x - x_max)
        return exps / np.sum(exps)

    def __init__(self, vocab_size: int, hs_size: int = 64, lr: float = 0.01):
        self.hs_size = hs_size
        self.vocab_size = vocab_size
        self.lr = lr
        # weights, scaled down to keep early activations from saturating
        self.Whx = np.random.randn(hs_size, vocab_size) * np.sqrt(1.0 / self.vocab_size)  # (h, v)
        self.Whh = np.random.randn(hs_size, hs_size) * np.sqrt(1.0 / self.hs_size)        # (h, h)
        self.Why = np.random.randn(vocab_size, hs_size) * np.sqrt(1.0 / self.hs_size)     # (v, h)
        # biases (zero initialization is the more conventional choice)
        self.bh = np.random.randn(hs_size, 1)     # (h, 1)
        self.by = np.random.randn(vocab_size, 1)  # (v, 1)
        # Adam state: first- and second-moment estimates, one slot per parameter
        self.velocities = [0, 0, 0, 0, 0]
        self.G = [0, 0, 0, 0, 0]
        self.t = 0

    def step(self, inputs, targets, h_prev):
        # map characters to vocabulary indices (`indices` is a global char -> index dict)
        inputs = [indices[i] for i in inputs]
        targets = [indices[i] for i in targets]
        xs, hs, ys = {}, {}, {}
        hs[-1] = np.copy(h_prev)  # so hs[i - 1] works at i = 0
        loss = 0
        for i in range(len(inputs)):
            # one-hot encode the current character
            x = np.zeros((self.vocab_size, 1))
            x[inputs[i]] = 1
            xs[i] = x
            # h = tanh(Whx @ x + Whh @ h_prev + bh)
            hs[i] = np.tanh(self.Whx @ x + self.Whh @ hs[i - 1] + self.bh)
            # y = softmax(Why @ h + by)
            ys[i] = self.softmax(self.Why @ hs[i] + self.by)
            # cross-entropy loss on the target character
            loss += -np.log(ys[i][targets[i], 0] + 1e-8)
        self.backward(xs, hs, ys, targets)
        return loss, hs[len(inputs) - 1]

    def backward(self, xs, hs, ys, targets):
        # gradients of weights and biases
        dWhy = np.zeros_like(self.Why)
        dWhh = np.zeros_like(self.Whh)
        dWhx = np.zeros_like(self.Whx)
        dby = np.zeros_like(self.by)
        dbh = np.zeros_like(self.bh)
        dh_next = np.zeros_like(hs[0])
        for i in reversed(range(len(xs))):
            # softmax + cross-entropy gradient: y_hat - y
            dy = np.copy(ys[i])
            dy[targets[i]] -= 1
            dWhy += dy @ hs[i].T
            dby += dy
            # gradient into h: from the output and from the next time step
            dh = self.Why.T @ dy + dh_next
            # backprop through tanh: d/dz tanh(z) = 1 - tanh(z)^2
            dh_raw = (1 - hs[i] ** 2) * dh
            dWhx += dh_raw @ xs[i].T
            dWhh += dh_raw @ hs[i - 1].T
            dbh += dh_raw
            dh_next = self.Whh.T @ dh_raw
        # clip to mitigate exploding gradients
        for dparam in [dWhx, dWhh, dWhy, dbh, dby]:
            np.clip(dparam, -5, 5, out=dparam)
        self.update_params_adam(dWhy, dWhh, dWhx, dby, dbh)

    # Adam (adaptive moment estimation)
    def update_params_adam(self, dWhy, dWhh, dWhx, dby, dbh):
        self.t += 1
        grads = [dWhy, dWhh, dWhx, dby, dbh]
        params = [self.Why, self.Whh, self.Whx, self.by, self.bh]
        beta1, beta2 = 0.99, 0.9999
        for i in range(len(params)):
            # exponentially decaying first and second moments
            self.velocities[i] = self.velocities[i] * beta1 + (1 - beta1) * grads[i]
            self.G[i] = self.G[i] * beta2 + (1 - beta2) * grads[i] ** 2
            # bias correction
            v_cap = self.velocities[i] / (1 - beta1 ** self.t)
            g_cap = self.G[i] / (1 - beta2 ** self.t)
            n = v_cap / (np.sqrt(g_cap) + 1e-8)
            params[i] -= self.lr * n

    def sample(self):
        h = np.zeros((self.hs_size, 1))
        x = np.zeros((self.vocab_size, 1))
        x[indices["~"]] = 1  # "~" is the start-of-sequence token
        sequence = []
        for _ in range(14):
            # forward pass
            h = np.tanh(self.Whx @ x + self.Whh @ h + self.bh)
            y = self.softmax(self.Why @ h + self.by)
            # sample the next character from the output distribution
            char_idx = np.random.choice(range(self.vocab_size), p=y.ravel())
            sequence.append(char_idx)
            # feed the sampled character back in as the next input
            x = np.zeros((self.vocab_size, 1))
            x[char_idx] = 1
        return sequence
```
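
A minimal training-loop sketch to tie it together, building the global `indices` map the class relies on; the corpus, sequence length, iteration count, and the `chars_by_index` helper are placeholders for illustration:

```python
data = "~hello world~goodbye world"  # placeholder corpus; "~" marks sequence starts
chars = sorted(set(data))
indices = {c: i for i, c in enumerate(chars)}        # char -> index, used inside RNN
chars_by_index = {i: c for c, i in indices.items()}  # index -> char, for printing

rnn = RNN(vocab_size=len(chars))
seq_len = 8
h = np.zeros((rnn.hs_size, 1))
pos = 0
for it in range(1000):
    if pos + seq_len + 1 >= len(data):
        pos, h = 0, np.zeros((rnn.hs_size, 1))  # wrap around and reset the state
    inputs = data[pos:pos + seq_len]
    targets = data[pos + 1:pos + seq_len + 1]   # next-character targets
    loss, h = rnn.step(inputs, targets, h)
    pos += seq_len
    if it % 100 == 0:
        print(it, float(loss))
        print("".join(chars_by_index[i] for i in rnn.sample()))
```

Note that `step` both runs the forward pass and applies a full backward/Adam update, so a single call per chunk is all the loop needs; the hidden state `h` is carried across chunks so context survives the chunk boundaries.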