Backpropagation Example for Simple Recurrent Network

In [1]:
# a SimpleRNN model
import numpy as np
import keras.backend as K
from keras.layers import Dense, Embedding, SimpleRNN
from keras.models import Sequential
model = Sequential()
model.add(Embedding(8,3))
model.add(SimpleRNN(4))
model.add(Dense(1, activation = "sigmoid"))
model.compile(loss = "binary_crossentropy", optimizer = "sgd", metrics = [ "accuracy" ])
model.summary()
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_1 (Embedding)      (None, None, 3)           24        
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 4)                 32        
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 5         
=================================================================
Total params: 61
Trainable params: 61
Non-trainable params: 0
_________________________________________________________________
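The parameter counts follow directly from the layer shapes: the embedding holds 8 × 3 = 24 weights; the SimpleRNN has a 3 × 4 input kernel, a 4 × 4 recurrent kernel, and 4 biases, for 12 + 16 + 4 = 32; and the dense layer has 4 weights plus 1 bias. A minimal sanity check of that arithmetic (just mirroring the summary above, not part of the original run):

# sanity check of the parameter counts reported by model.summary()
vocab_size, emb_dim, rnn_units = 8, 3, 4
assert vocab_size * emb_dim == 24                            # Embedding
assert emb_dim * rnn_units + rnn_units**2 + rnn_units == 32  # SimpleRNN: kernel + recurrent kernel + bias
assert rnn_units * 1 + 1 == 5                                # Dense: kernel + bias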
In [2]:
# a prediction
X = np.array([ 4, 6 ]).astype("int32").reshape(1, 2)
Y = np.array([ 1 ]).astype("float32").reshape(1, 1)
model.predict(X)
Out[2]:
array([[0.48414087]], dtype=float32)
In [3]:
# gathering the weights of the model
P = model.layers[0].get_weights()[0]  # embedding matrix
Q = model.layers[1].get_weights()[0]  # RNN input kernel (applied to the embedding output)
R = model.layers[1].get_weights()[2]  # RNN bias
S = model.layers[1].get_weights()[1]  # RNN recurrent kernel (applied to the previous hidden state)
T = model.layers[2].get_weights()[0]  # dense layer kernel
U = model.layers[2].get_weights()[1]  # dense layer bias

print(P.shape)
print(Q.shape)
print(R.shape)
print(S.shape)
print(T.shape)
print(U.shape)
(8, 3)
(3, 4)
(4,)
(4, 4)
(4, 1)
(1,)
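For a SimpleRNN layer, get_weights() returns the list [kernel, recurrent_kernel, bias], which is why Q, S, and R come from indices 0, 1, and 2 above; the printed shapes confirm the assignment. If in doubt, the weight variables can be inspected by name; a minimal sketch (the exact names may vary across Keras versions):

# confirm the ordering of the SimpleRNN weights by name and shape
for w in model.layers[1].weights:
    print(w.name, K.int_shape(w))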
In [4]:
# equivalent methods for making the prediction
print(model.predict(X))

from numpy import dot
from numpy import tanh
from numpy import exp
def sigmoid(x):
    return 1.0 / (1.0 + exp(-x))

print(
    sigmoid(dot(tanh(dot(P[6,:], Q) + R + dot(tanh(dot(P[4,:], Q) + R), S)), T) + U)
)

print(
    sigmoid(  tanh(P[6,0] * Q[0,0] + P[6,1] * Q[1,0] + P[6,2] * Q[2,0] + R[0]
                + tanh(P[4,0] * Q[0,0] + P[4,1] * Q[1,0] + P[4,2] * Q[2,0] + R[0]) * S[0,0]
                + tanh(P[4,0] * Q[0,1] + P[4,1] * Q[1,1] + P[4,2] * Q[2,1] + R[1]) * S[1,0]
                + tanh(P[4,0] * Q[0,2] + P[4,1] * Q[1,2] + P[4,2] * Q[2,2] + R[2]) * S[2,0]
                + tanh(P[4,0] * Q[0,3] + P[4,1] * Q[1,3] + P[4,2] * Q[2,3] + R[3]) * S[3,0]) * T[0,0]
            + tanh(P[6,0] * Q[0,1] + P[6,1] * Q[1,1] + P[6,2] * Q[2,1] + R[1]
                + tanh(P[4,0] * Q[0,0] + P[4,1] * Q[1,0] + P[4,2] * Q[2,0] + R[0]) * S[0,1]
                + tanh(P[4,0] * Q[0,1] + P[4,1] * Q[1,1] + P[4,2] * Q[2,1] + R[1]) * S[1,1]
                + tanh(P[4,0] * Q[0,2] + P[4,1] * Q[1,2] + P[4,2] * Q[2,2] + R[2]) * S[2,1]
                + tanh(P[4,0] * Q[0,3] + P[4,1] * Q[1,3] + P[4,2] * Q[2,3] + R[3]) * S[3,1]) * T[1,0]
            + tanh(P[6,0] * Q[0,2] + P[6,1] * Q[1,2] + P[6,2] * Q[2,2] + R[2]
                + tanh(P[4,0] * Q[0,0] + P[4,1] * Q[1,0] + P[4,2] * Q[2,0] + R[0]) * S[0,2]
                + tanh(P[4,0] * Q[0,1] + P[4,1] * Q[1,1] + P[4,2] * Q[2,1] + R[1]) * S[1,2]
                + tanh(P[4,0] * Q[0,2] + P[4,1] * Q[1,2] + P[4,2] * Q[2,2] + R[2]) * S[2,2]
                + tanh(P[4,0] * Q[0,3] + P[4,1] * Q[1,3] + P[4,2] * Q[2,3] + R[3]) * S[3,2]) * T[2,0]
            + tanh(P[6,0] * Q[0,3] + P[6,1] * Q[1,3] + P[6,2] * Q[2,3] + R[3]
                + tanh(P[4,0] * Q[0,0] + P[4,1] * Q[1,0] + P[4,2] * Q[2,0] + R[0]) * S[0,3]
                + tanh(P[4,0] * Q[0,1] + P[4,1] * Q[1,1] + P[4,2] * Q[2,1] + R[1]) * S[1,3]
                + tanh(P[4,0] * Q[0,2] + P[4,1] * Q[1,2] + P[4,2] * Q[2,2] + R[2]) * S[2,3]
                + tanh(P[4,0] * Q[0,3] + P[4,1] * Q[1,3] + P[4,2] * Q[2,3] + R[3]) * S[3,3]) * T[3,0]
            + U[0])
)
[[0.48414087]]
[0.48414087]
0.48414083321708756
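The fully expanded expression above spells out every multiplication. The same computation can be written as a short loop over timesteps, h_t = tanh(x_t·Q + h_{t-1}·S + R) followed by sigmoid(h·T + U), which also generalizes to longer inputs. A minimal sketch using the weights already defined:

# the same forward pass as a loop over the input tokens
h = np.zeros(4)                  # initial hidden state is zero
for token in X[0]:               # tokens 4 and 6
    h = tanh(dot(P[token, :], Q) + dot(h, S) + R)
print(sigmoid(dot(h, T) + U))    # matches model.predict(X)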
In [5]:
# the tanh() and sigmoid() sub-expressions whose gradients involve Q[0,0]:
# f0 is hidden unit 0 at step 1; f1..f4 are the four hidden units at step 2;
# f5 is the sigmoid output
f0 = tanh(P[4,0] * Q[0,0] + P[4,1] * Q[1,0] + P[4,2] * Q[2,0] + R[0])

f1 = tanh(P[6,0] * Q[0,0] + P[6,1] * Q[1,0] + P[6,2] * Q[2,0] + R[0]
          + tanh(P[4,0] * Q[0,0] + P[4,1] * Q[1,0] + P[4,2] * Q[2,0] + R[0]) * S[0,0]
          + tanh(P[4,0] * Q[0,1] + P[4,1] * Q[1,1] + P[4,2] * Q[2,1] + R[1]) * S[1,0]
          + tanh(P[4,0] * Q[0,2] + P[4,1] * Q[1,2] + P[4,2] * Q[2,2] + R[2]) * S[2,0]
          + tanh(P[4,0] * Q[0,3] + P[4,1] * Q[1,3] + P[4,2] * Q[2,3] + R[3]) * S[3,0])

f2 = tanh(P[6,0] * Q[0,1] + P[6,1] * Q[1,1] + P[6,2] * Q[2,1] + R[1]
          + tanh(P[4,0] * Q[0,0] + P[4,1] * Q[1,0] + P[4,2] * Q[2,0] + R[0]) * S[0,1]
          + tanh(P[4,0] * Q[0,1] + P[4,1] * Q[1,1] + P[4,2] * Q[2,1] + R[1]) * S[1,1]
          + tanh(P[4,0] * Q[0,2] + P[4,1] * Q[1,2] + P[4,2] * Q[2,2] + R[2]) * S[2,1]
          + tanh(P[4,0] * Q[0,3] + P[4,1] * Q[1,3] + P[4,2] * Q[2,3] + R[3]) * S[3,1])

f3 = tanh(P[6,0] * Q[0,2] + P[6,1] * Q[1,2] + P[6,2] * Q[2,2] + R[2]
          + tanh(P[4,0] * Q[0,0] + P[4,1] * Q[1,0] + P[4,2] * Q[2,0] + R[0]) * S[0,2]
          + tanh(P[4,0] * Q[0,1] + P[4,1] * Q[1,1] + P[4,2] * Q[2,1] + R[1]) * S[1,2]
          + tanh(P[4,0] * Q[0,2] + P[4,1] * Q[1,2] + P[4,2] * Q[2,2] + R[2]) * S[2,2]
          + tanh(P[4,0] * Q[0,3] + P[4,1] * Q[1,3] + P[4,2] * Q[2,3] + R[3]) * S[3,2])

f4 = tanh(P[6,0] * Q[0,3] + P[6,1] * Q[1,3] + P[6,2] * Q[2,3] + R[3]
          + tanh(P[4,0] * Q[0,0] + P[4,1] * Q[1,0] + P[4,2] * Q[2,0] + R[0]) * S[0,3]
          + tanh(P[4,0] * Q[0,1] + P[4,1] * Q[1,1] + P[4,2] * Q[2,1] + R[1]) * S[1,3]
          + tanh(P[4,0] * Q[0,2] + P[4,1] * Q[1,2] + P[4,2] * Q[2,2] + R[2]) * S[2,3]
          + tanh(P[4,0] * Q[0,3] + P[4,1] * Q[1,3] + P[4,2] * Q[2,3] + R[3]) * S[3,3])

f5 = sigmoid(  tanh(P[6,0] * Q[0,0] + P[6,1] * Q[1,0] + P[6,2] * Q[2,0] + R[0]
                 + tanh(P[4,0] * Q[0,0] + P[4,1] * Q[1,0] + P[4,2] * Q[2,0] + R[0]) * S[0,0]
                 + tanh(P[4,0] * Q[0,1] + P[4,1] * Q[1,1] + P[4,2] * Q[2,1] + R[1]) * S[1,0]
                 + tanh(P[4,0] * Q[0,2] + P[4,1] * Q[1,2] + P[4,2] * Q[2,2] + R[2]) * S[2,0]
                 + tanh(P[4,0] * Q[0,3] + P[4,1] * Q[1,3] + P[4,2] * Q[2,3] + R[3]) * S[3,0]) * T[0,0]
             + tanh(P[6,0] * Q[0,1] + P[6,1] * Q[1,1] + P[6,2] * Q[2,1] + R[1]
                 + tanh(P[4,0] * Q[0,0] + P[4,1] * Q[1,0] + P[4,2] * Q[2,0] + R[0]) * S[0,1]
                 + tanh(P[4,0] * Q[0,1] + P[4,1] * Q[1,1] + P[4,2] * Q[2,1] + R[1]) * S[1,1]
                 + tanh(P[4,0] * Q[0,2] + P[4,1] * Q[1,2] + P[4,2] * Q[2,2] + R[2]) * S[2,1]
                 + tanh(P[4,0] * Q[0,3] + P[4,1] * Q[1,3] + P[4,2] * Q[2,3] + R[3]) * S[3,1]) * T[1,0]
             + tanh(P[6,0] * Q[0,2] + P[6,1] * Q[1,2] + P[6,2] * Q[2,2] + R[2]
                 + tanh(P[4,0] * Q[0,0] + P[4,1] * Q[1,0] + P[4,2] * Q[2,0] + R[0]) * S[0,2]
                 + tanh(P[4,0] * Q[0,1] + P[4,1] * Q[1,1] + P[4,2] * Q[2,1] + R[1]) * S[1,2]
                 + tanh(P[4,0] * Q[0,2] + P[4,1] * Q[1,2] + P[4,2] * Q[2,2] + R[2]) * S[2,2]
                 + tanh(P[4,0] * Q[0,3] + P[4,1] * Q[1,3] + P[4,2] * Q[2,3] + R[3]) * S[3,2]) * T[2,0]
             + tanh(P[6,0] * Q[0,3] + P[6,1] * Q[1,3] + P[6,2] * Q[2,3] + R[3]
                 + tanh(P[4,0] * Q[0,0] + P[4,1] * Q[1,0] + P[4,2] * Q[2,0] + R[0]) * S[0,3]
                 + tanh(P[4,0] * Q[0,1] + P[4,1] * Q[1,1] + P[4,2] * Q[2,1] + R[1]) * S[1,3]
                 + tanh(P[4,0] * Q[0,2] + P[4,1] * Q[1,2] + P[4,2] * Q[2,2] + R[2]) * S[2,3]
                 + tanh(P[4,0] * Q[0,3] + P[4,1] * Q[1,3] + P[4,2] * Q[2,3] + R[3]) * S[3,3]) * T[3,0]
             + U[0])
In [6]:
# local gradients of the tanh() and sigmoid() functions involving Q[0,0]
g6 = (f5 - Y) / (f5 * (1 - f5))  # d(binary cross-entropy)/d(f5)
g5 = (f5 * (1 - f5))             # d(sigmoid)/d(argument), written in terms of the output f5
g4 = (1 - f4 * f4)               # d(tanh)/d(argument), written in terms of the output f4
g3 = (1 - f3 * f3)
g2 = (1 - f2 * f2)
g1 = (1 - f1 * f1)
g0 = (1 - f0 * f0)
In [7]:
# the gradient for Q[0,0] by the chain rule: one term per path through which
# Q[0,0] reaches the loss (directly via f1 at step 2, and via f0 at step 1
# feeding each of f1..f4)
gradient = \
g6 * g5 * T[0,0] * g1 * P[6,0] + \
g6 * g5 * T[0,0] * g1 * S[0,0] * g0 * P[4,0] + \
g6 * g5 * T[1,0] * g2 * S[0,1] * g0 * P[4,0] + \
g6 * g5 * T[2,0] * g3 * S[0,2] * g0 * P[4,0] + \
g6 * g5 * T[3,0] * g4 * S[0,3] * g0 * P[4,0]
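An independent check on this hand-derived gradient is a central finite difference of the loss with respect to Q[0,0]. A minimal sketch; loss_at is a hypothetical helper that re-runs the loop-style forward pass with Q[0,0] perturbed:

# numerical cross-check of the analytic gradient (loss_at is a hypothetical helper)
def loss_at(eps):
    Qp = Q.copy()
    Qp[0, 0] += eps                # perturb only Q[0,0]
    h = np.zeros(4)
    for token in X[0]:
        h = tanh(dot(P[token, :], Qp) + dot(h, S) + R)
    p = sigmoid(dot(h, T) + U)[0]
    return -(Y[0, 0] * np.log(p) + (1 - Y[0, 0]) * np.log(1 - p))  # binary cross-entropy

eps = 1e-6
print((loss_at(eps) - loss_at(-eps)) / (2 * eps))  # should match gradient[0,0]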
In [8]:
# comparing results
model.fit(X, Y, epochs = 1, batch_size = 1)
print(model.layers[1].get_weights()[0] - Q)
print(format((model.layers[1].get_weights()[0] - Q)[0,0], ".7f"))
print(format(- K.eval(model.optimizer.lr) * gradient[0,0], ".7f"))
Epoch 1/1
1/1 [==============================] - 1s 552ms/step - loss: 0.7254 - acc: 0.0000e+00
[[-3.13930213e-05  2.05814838e-04  2.00150535e-04 -1.54078007e-04]
 [-5.44786453e-05  2.56896019e-04  1.96754932e-04 -1.24067068e-04]
 [ 1.02996826e-04 -2.93105841e-04 -8.28206539e-05 -4.07695770e-05]]
-0.0000314
-0.0000314
In [9]:
model.predict(X)    # closer to target!
Out[9]:
array([[0.4986328]], dtype=float32)