Backpropagation with Convolution

In Keras, the filters of a 2D convolution layer are stored in a single 4D tensor whose dimensions are, in order:

  1. filter height
  2. filter width
  3. input channel count
  4. output channel count (filter count)
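
For example (a minimal sketch, assuming the default channels-last data format), a Conv2D layer with four 3x3 filters over a single-channel input stores its kernel as a tensor of shape (3, 3, 1, 4):

from tensorflow.keras.layers import Conv2D

# sketch: build a Conv2D layer in isolation and inspect its kernel shape
layer = Conv2D(4, (3, 3))
layer.build((None, 5, 5, 1))            # 5x5 single-channel input
print(layer.get_weights()[0].shape)     # (3, 3, 1, 4)
print(layer.get_weights()[1].shape)     # (4,) -- one bias per filter
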
In [1]:
# create data

image_height = 5
image_width = 5
filter_height = 3
filter_width = 3

import numpy as np
np.set_printoptions(precision=6)
np0 = np.float32(0)
np1 = np.float32(1)

zeros = np.zeros((image_height, image_width), dtype = "float32")
cross  = np.array([
    [ 1, 0, 1 ],
    [ 0, 1, 0 ],
    [ 1, 0, 1 ]
    ], dtype = "float32")
nought = np.array([
    [ 1, 1, 1 ],
    [ 1, 0, 1 ],
    [ 1, 1, 1 ]
    ], dtype = "float32")

# place each 3x3 template at every valid offset in the 5x5 image
X = []
for template in [ cross, nought ]:
    for i in range(image_height - template.shape[0] + 1):
        for j in range(image_width - template.shape[1] + 1):
            temp = zeros.copy()
            temp[i:(i+template.shape[0]), j:(j+template.shape[1])] = template
            X.append(temp)

# 9 crosses labelled 1 followed by 9 noughts labelled 0
X = np.array(X).reshape((9 + 9, image_height, image_width, 1)).astype("float32")
Y = np.array(([1]*9) + ([0]*9)).astype("float32")

print("X[0]:\n", X[0,:,:,0])
print("Y[0]:", Y[0])
X[0]:
 [[1. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0.]
 [1. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
Y[0]: 1.0
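
The data set is therefore 18 single-channel 5x5 images: 9 shifted crosses labelled 1 and 9 shifted noughts labelled 0. A quick shape check (a sketch, not part of the original run):

# sketch: confirm the data set shapes
print(X.shape)   # (18, 5, 5, 1)
print(Y.shape)   # (18,)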
In [2]:
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Conv2D, Dense, Flatten, MaxPooling2D
from tensorflow.keras.optimizers import SGD

# create model
model = Sequential()
model.add(Conv2D(1, (filter_height, filter_width), padding = "same", activation = "relu", input_shape = X.shape[1:4]))
model.add(Flatten())
model.add(Dense(1, activation = "sigmoid"))
model.summary()
old_weights = model.get_weights()

# forward propagation: a sub-model exposes the conv layer's activations
conv_activations = Model(model.input, model.layers[0].output)
conv_out = conv_activations.predict(X[:1])[0,:,:,0]
pred_out = model.predict(X[:1])[0,0]

# manual convolution (a one-pixel zero border reproduces padding = "same")
padded = np.zeros((X.shape[1] + 2, X.shape[2] + 2), dtype = "float32")
padded[1:-1,1:-1] = X[0,:,:,0]
conv_out_manual = np.zeros((X.shape[1], X.shape[2]), dtype = "float32")
for i in range(X.shape[1]):
    for j in range(X.shape[2]):
        conv_out_manual[i,j] = np.sum(padded[i:(i+filter_height), j:(j+filter_width)] * old_weights[0][:,:,0,0]) \
                             + old_weights[1][0]

# manual activation
conv_out_manual = np.where(conv_out_manual > 0, conv_out_manual, np0)   # relu
pd_relu_conv = np.where(conv_out_manual > 0, np1, np0)                  # d relu / d conv: 1 where positive, 0 elsewhere
log_odds = np.dot(conv_out_manual.reshape(X.shape[1] * X.shape[2]), old_weights[2][:,0]) + old_weights[3][0]
pred_out_manual = np1/(np1 + np.exp(- log_odds))                        # sigmoid
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
conv2d (Conv2D)              (None, 5, 5, 1)           10        
_________________________________________________________________
flatten (Flatten)            (None, 25)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 26        
=================================================================
Total params: 36
Trainable params: 36
Non-trainable params: 0
_________________________________________________________________
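
The parameter counts in the summary follow directly from the shapes: the Conv2D layer has 3*3*1*1 kernel weights plus one bias (10), and the dense layer connects the 25 flattened activations to a single output plus one bias (26). A quick sketch using the variables defined above:

# sketch: reproduce the parameter counts from the summary
print(filter_height * filter_width * 1 * 1 + 1)   # Conv2D: 9 weights + 1 bias = 10
print(image_height * image_width * 1 + 1)         # Dense: 25 weights + 1 bias = 26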
In [3]:
print("conv output:\n", conv_out)
print("conv manual:\n", conv_out_manual)
print("partial derivative of relu activation with respect to convolution:\n", pd_relu_conv)
conv output:
 [[0.046522 0.013234 0.726592 0.       0.      ]
 [0.62291  0.396992 0.572069 0.83382  0.      ]
 [0.       0.       0.578023 0.       0.      ]
 [0.381381 0.       0.381381 0.342625 0.      ]
 [0.       0.       0.       0.       0.      ]]
conv manual:
 [[0.046522 0.013234 0.726592 0.       0.      ]
 [0.62291  0.396992 0.572069 0.83382  0.      ]
 [0.       0.       0.578023 0.       0.      ]
 [0.381381 0.       0.381381 0.342625 0.      ]
 [0.       0.       0.       0.       0.      ]]
partial derivative of relu activation with respect to convolution:
 [[1. 1. 1. 0. 0.]
 [1. 1. 1. 1. 0.]
 [0. 0. 1. 0. 0.]
 [1. 0. 1. 1. 0.]
 [0. 0. 0. 0. 0.]]
In [4]:
print("prediction output:", pred_out)
print("prediction manual:", pred_out_manual)
prediction output: 0.32857916
prediction manual: 0.32857916
In [5]:
# train on first example
model.compile(loss = "binary_crossentropy", optimizer = SGD(lr = 0.01), metrics = [ "accuracy" ])
model.fit(X[:1], Y[:1])
new_weights = model.get_weights()
Train on 1 samples
1/1 [==============================] - 0s 312ms/sample - loss: 1.1130 - accuracy: 0.0000e+00
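
The first line of the next cell multiplies the loss derivative dL/dp by the sigmoid derivative dp/dz. For a sigmoid output under binary cross-entropy the p*(1-p) factors cancel, so the product reduces to p - y; a quick numeric check (a sketch using the values already computed):

# sketch: for sigmoid output + binary cross-entropy, dL/dlogodds = p - y
p, y = pred_out, Y[0]
dL_dp = (p - y) / (p * (np1 - p))          # derivative of the cross-entropy loss w.r.t. the prediction
dp_dz = p * (np1 - p)                      # derivative of the sigmoid w.r.t. the log-odds
print(np.allclose(dL_dp * dp_dz, p - y))   # True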
In [6]:
# manual convolution weight update
# dL/dz for sigmoid + binary cross-entropy (the p*(1-p) factors cancel, leaving p - y)
pd_loss_logodds = ((pred_out - Y[0]) / (pred_out * (np1 - pred_out))) * ((pred_out) * (np1 - pred_out))
# dL/dw[w_i,w_j] sums, over every output position (i,j), the dense weight feeding that
# position, the relu derivative there, and the padded input pixel the filter weight saw
pd_loss_wgt = np.zeros((filter_height, filter_width), dtype = "float32")
for w_i in range(filter_height):
    for w_j in range(filter_width):
        for i in range(X.shape[1]):
            for j in range(X.shape[2]):
                pd_loss_wgt[w_i,w_j] += (pd_loss_logodds \
                                         * old_weights[2][X.shape[2]*i+j, 0] \
                                         * pd_relu_conv[i,j] \
                                         * padded[i:(i+filter_height), j:(j+filter_width)][w_i,w_j])

print("weight update:\n", new_weights[0][:,:,0,0] - old_weights[0][:,:,0,0])
print("weight manual:\n", - 0.01 * pd_loss_wgt)
weight update:
 [[-0.004644  0.000785 -0.001612]
 [-0.001374 -0.005766 -0.004497]
 [-0.006492 -0.003035 -0.002225]]
weight manual:
 [[-0.004644  0.000785 -0.001612]
 [-0.001374 -0.005766 -0.004497]
 [-0.006492 -0.003035 -0.002225]]
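
The same chain rule, minus the convolution term, gives the dense-layer gradient: dL/dw_k = (p - y) * a_k, where a_k is the k-th flattened relu activation. As a further sketch (not part of the original run), the dense-layer update can be checked the same way:

# sketch: the dense weight update should equal -lr * (p - y) * flattened activations
pd_loss_dense = pd_loss_logodds * conv_out_manual.reshape(X.shape[1] * X.shape[2])
print("dense update:\n", new_weights[2][:,0] - old_weights[2][:,0])
print("dense manual:\n", - 0.01 * pd_loss_dense)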