Matrix-Based Computation
The `update_mini_batch` function from the previous chapter runs backpropagation on each training example independently and then averages the per-example gradients:
def update_mini_batch(self, mini_batch, eta):
    """Update the network's weights and biases by applying
    gradient descent using backpropagation to a single mini batch.
    The ``mini_batch`` is a list of tuples ``(x, y)``, and ``eta``
    is the learning rate."""
    # Initialize the gradient matrices to zero.
    # Nabla is the vector differential operator (the "del" / inverted-triangle operator).
    nabla_b = [np.zeros(b.shape) for b in self.biases]
    nabla_w = [np.zeros(w.shape) for w in self.weights]
    for x, y in mini_batch:
        # Accumulate the gradients sample by sample.
        # Get the gradients for the current sample via backpropagation.
        delta_nabla_b, delta_nabla_w = self.backprop(x, y)
        # Sum the gradients computed for each sample in the mini batch;
        # the average is taken below.
        nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
        nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
    # Average the gradients, scale by the learning rate eta,
    # and update the weight and bias matrices.
    self.weights = [w-(eta/len(mini_batch))*nw for w, nw in zip(self.weights, nabla_w)]
    self.biases = [b-(eta/len(mini_batch))*nb for b, nb in zip(self.biases, nabla_b)]
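The reason a whole mini batch can be pushed through backpropagation in one pass is that summing per-sample gradients is itself a matrix operation: if the samples are stacked as columns, the sum of outer products delta * a^T becomes a single matrix product, and the sum of the per-sample delta vectors becomes a sum along one axis. A minimal NumPy sketch of this equivalence (the shapes below are hypothetical, chosen to match a 10-neuron output layer, a 30-neuron hidden layer and a batch of 5):

import numpy as np

# Hypothetical shapes: 10 output neurons, 30 hidden neurons, batch size 5.
delta = np.random.randn(10, 5)       # one error column per sample
activation = np.random.randn(30, 5)  # one activation column per sample

# Per-sample version: sum of outer products, as in update_mini_batch above.
per_sample = sum(np.outer(delta[:, k], activation[:, k]) for k in range(5))

# Matrix-based version: a single dot product over the whole batch.
batched = np.dot(delta, activation.transpose())
print(np.allclose(per_sample, batched))  # True

# The bias gradient works the same way: summing the per-sample delta
# columns is just a sum along axis 1.
print(np.allclose(sum(delta[:, k] for k in range(5)), np.sum(delta, axis=1)))  # True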
This section shows how to compute the gradients for an entire mini batch directly with matrix operations. Code source: https://github.com/hindSellouk/Matrix-Based-Backpropagation/blob/master/Network1.py
import random
import time

import numpy as np


class Network(object):

    def __init__(self, sizes):
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(sizes[:-1], sizes[1:])]

    def SGD(self, training_data, epochs, mini_batch_size, eta,
            test_data=None):
        if test_data: n_test = len(test_data)
        n = len(training_data)
        for j in range(epochs):
            start_time = time.time()
            random.shuffle(training_data)
            mini_batches = [
                training_data[k:k+mini_batch_size]
                for k in range(0, n, mini_batch_size)]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta)
            print("--- %s seconds elapsed---" % (time.time() - start_time))
            if test_data:
                print("Epoch {0}: {1} / {2}".format(
                    j, self.evaluate(test_data), n_test))
            else:
                print("Epoch {0} complete".format(j))
    def update_mini_batch(self, mini_batch, eta):
        # mini_batch is a list whose elements are tuples (x, y).
        matrix_X = mini_batch[0][0]
        matrix_Y = mini_batch[0][1]
        # Create matrix_X of examples and matrix_Y of labels.
        for x, y in mini_batch[1:]:
            # Concatenate every sample in the mini batch, column by column,
            # into a single matrix.
            matrix_X = np.concatenate((matrix_X, x), axis=1)
            matrix_Y = np.concatenate((matrix_Y, y), axis=1)
        nabla_b, nabla_w = self.backprop(matrix_X, matrix_Y)
        # Average the returned weight and bias gradients and apply the update.
        self.weights = [w - (eta / len(mini_batch)) * nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b - (eta / len(mini_batch)) * nb
                       for b, nb in zip(self.biases, nabla_b)]
    def backprop(self, x, y):
        # Here x and y are both matrices holding a whole batch of samples,
        # one column per sample.
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        # feedforward
        activation = x
        activations = [x]  # list to store all the activation matrices, layer by layer
        zs = []  # list to store all the "sum of weighted inputs z" matrices, layer by layer
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation) + b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)
        # backward pass
        # delta is now a matrix; with a batch size of 10 a debug print shows
        # ('backprop delta', (10, 10), <type 'numpy.ndarray'>)
        delta = self.cost_derivative(activations[-1], y) * \
            sigmoid_prime(zs[-1])
        # First sum delta over the batch (axis=1), giving shape
        # (10,), <type 'numpy.ndarray'>
        # then expand it back into a column vector:
        # ('nabla_b[-1]', (10, 1), <type 'numpy.ndarray'>)
        nabla_b[-1] = np.expand_dims(np.sum(delta, axis=1), axis=1)
        # Weight gradient of the last layer:
        # ('nabla_w[-1]', (10, 30), <type 'numpy.ndarray'>)
        # Here np.dot sums the per-sample weight gradients over the batch.
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())
        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            # For the 784-30-10 network with batch size 10, delta below has shape (30, 10).
            delta = np.dot(self.weights[-l + 1].transpose(), delta) * sp
            # Here np.sum sums the per-sample errors over the batch.
            nabla_b[-l] = np.expand_dims(np.sum(delta, axis=1), axis=1)
            # Here np.dot sums the per-sample weight gradients over the batch.
            # In the per-sample version delta is n * 1 and activations[-l - 1].transpose() is 1 * m;
            # with a batch, delta is n * batch_size and activations[-l - 1].transpose() is batch_size * m.
            nabla_w[-l] = np.dot(delta, activations[-l - 1].transpose())
        return (nabla_b, nabla_w)
    def feedforward(self, a):
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a)+b)
        return a

    def evaluate(self, test_data):
        test_results = [(np.argmax(self.feedforward(x)), y)
                        for (x, y) in test_data]
        return sum(int(x == y) for (x, y) in test_results)

    def cost_derivative(self, output_activations, y):
        return (output_activations-y)

def sigmoid(z):
    return 1.0/(1.0+np.exp(-z))

def sigmoid_prime(z):
    return sigmoid(z)*(1-sigmoid(z))
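One side note on `update_mini_batch` above: the loop that builds `matrix_X` and `matrix_Y` calls `np.concatenate` once per sample. An equivalent way is to stack all columns in a single call. A minimal sketch, where the random arrays merely stand in for MNIST columns of shape (784, 1) and (10, 1):

import numpy as np

# Hypothetical stand-in for a mini batch of 10 MNIST samples:
# each x is a (784, 1) input column, each y a (10, 1) one-hot label column.
mini_batch = [(np.random.randn(784, 1), np.random.randn(10, 1)) for _ in range(10)]

# Same result as the per-sample np.concatenate loop, built in one call each.
matrix_X = np.hstack([x for x, y in mini_batch])  # shape (784, 10)
matrix_Y = np.hstack([y for x, y in mini_batch])  # shape (10, 10)
print(matrix_X.shape, matrix_Y.shape)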
Running the code:
import mnist_loader
training_data, validation_data, test_data = mnist_loader.load_data_wrapper()
import network1
net = network1.Network([784, 30, 10])
from datetime import datetime
print(datetime.now())
net.SGD(training_data, 30, 10, 3.0, test_data=test_data)
print(datetime.now())
With the matrix-based algorithm, one epoch takes about 3 seconds:
--- 3.12179398537 seconds elapsed---
Epoch 0: 9061 / 10000
--- 3.03969407082 seconds elapsed---
Epoch 1: 9177 / 10000
--- 3.00676393509 seconds elapsed---
Epoch 2: 9261 / 10000
With the per-sample (non-matrix) algorithm, one epoch takes about 11 seconds:
--- 11.0207378864 seconds elapsed---
Epoch 0: 9079 / 10000
--- 11.092361927 seconds elapsed---
Epoch 1: 9247 / 10000
--- 11.0678529739 seconds elapsed---
Epoch 2: 9326 / 10000
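The roughly 3.6x speedup (about 11 s down to about 3 s per epoch) comes largely from moving the loop over samples out of Python: instead of ten small matrix-vector products per mini batch, each layer performs one matrix-matrix product that NumPy dispatches to its optimized backend. A rough, hypothetical micro-benchmark of just that core operation (sizes matching the 784-30-10 network and a batch of 10):

import time
import numpy as np

w = np.random.randn(30, 784)                        # first-layer weights
xs = [np.random.randn(784, 1) for _ in range(10)]   # batch as 10 separate columns
X = np.hstack(xs)                                    # batch as one (784, 10) matrix

start = time.time()
for _ in range(10000):
    for x in xs:
        np.dot(w, x)          # per-sample: 10 small products per batch
print("per-sample:   %.3f s" % (time.time() - start))

start = time.time()
for _ in range(10000):
    np.dot(w, X)              # matrix-based: one product per batch
print("matrix-based: %.3f s" % (time.time() - start))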