# 1. 参数的更新

1. SGD：梯度下降法，需要更新的权重参数记为W，用学习率乘以损失函数关于W的梯度去不断更新权重。
2. Momentum：与上面SGD类似，但是多了两个参数，v表示物理上的速度，a是一个参数，比如0.9等，对应地面摩擦或空气阻力等。

## 1.1 SGD

SGD类的实现代码如下：

``````class SGD:

def __init__(self, lr=0.01):
self.lr = lr

def update(self, params, grads):
for key in params.keys():
params[key] -= self.lr * grads[key]
``````

``````network = TwoLayerNet(...)
optimizer = SGD()

for i in range(10000):
...
x_batch, t_batch = get_mini_batch(...) # mini-batch
params = network.params
...
``````

SGD的缺点：

SGD低效的根本原因是，梯度的方向没有指向最小值的方向，如下图：

## 1.2 Momentum

Momentum是动量的意思，与物理有关，实现代码如下：

``````class Momentum:

"""Momentum SGD"""

def __init__(self, lr=0.01, momentum=0.9):
self.lr = lr
self.momentum = momentum
self.v = None

def update(self, params, grads):
if self.v is None:
self.v = {}
for key, val in params.items():
self.v[key] = np.zeros_like(val)

for key in params.keys():
self.v[key] = self.momentum*self.v[key] - self.lr*grads[key]
params[key] += self.v[key]
``````

``````class AdaGrad:

def __init__(self, lr=0.01):
self.lr = lr
self.h = None

def update(self, params, grads):
if self.h is None:
self.h = {}
for key, val in params.items():
self.h[key] = np.zeros_like(val)

for key in params.keys():
params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7)
``````

``````class Adam:

def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):
self.lr = lr
self.beta1 = beta1
self.beta2 = beta2
self.iter = 0
self.m = None
self.v = None

def update(self, params, grads):
if self.m is None:
self.m, self.v = {}, {}
for key, val in params.items():
self.m[key] = np.zeros_like(val)
self.v[key] = np.zeros_like(val)

self.iter += 1
lr_t  = self.lr * np.sqrt(1.0 - self.beta2**self.iter) / (1.0 - self.beta1**self.iter)

for key in params.keys():
#self.m[key] = self.beta1*self.m[key] + (1-self.beta1)*grads[key]
#self.v[key] = self.beta2*self.v[key] + (1-self.beta2)*(grads[key]**2)
self.m[key] += (1 - self.beta1) * (grads[key] - self.m[key])
self.v[key] += (1 - self.beta2) * (grads[key]**2 - self.v[key])

params[key] -= lr_t * self.m[key] / (np.sqrt(self.v[key]) + 1e-7)

#unbias_m += (1 - self.beta1) * (grads[key] - self.m[key]) # correct bias
#unbisa_b += (1 - self.beta2) * (grads[key]*grads[key] - self.v[key]) # correct bias
#params[key] += self.lr * unbias_m / (np.sqrt(unbisa_b) + 1e-7)

``````

## 1.5 四种方式的比较

``````# coding: utf-8
import sys, os
sys.path.append(os.pardir)  # 为了导入父目录的文件而进行的设定
import numpy as np
import matplotlib.pyplot as plt
from collections import OrderedDict
from common.optimizer import *

def f(x, y):
return x**2 / 20.0 + y**2

def df(x, y):
return x / 10.0, 2.0*y

init_pos = (-7.0, 2.0)
params = {}
params['x'], params['y'] = init_pos[0], init_pos[1]

optimizers = OrderedDict()
optimizers["SGD"] = SGD(lr=0.95)
optimizers["Momentum"] = Momentum(lr=0.1)

idx = 1

for key in optimizers:
optimizer = optimizers[key]
x_history = []
y_history = []
params['x'], params['y'] = init_pos[0], init_pos[1]

for i in range(30):
x_history.append(params['x'])
y_history.append(params['y'])

x = np.arange(-10, 10, 0.01)
y = np.arange(-5, 5, 0.01)

X, Y = np.meshgrid(x, y)
Z = f(X, Y)

# for simple contour line
mask = Z > 7

# plot
plt.subplot(2, 2, idx)
idx += 1
plt.plot(x_history, y_history, 'o-', color="red")
plt.contour(X, Y, Z)
plt.ylim(-10, 10)
plt.xlim(-10, 10)
plt.plot(0, 0, '+')
#colorbar()
#spring()
plt.title(key)
plt.xlabel("x")
plt.ylabel("y")

plt.show()
``````

## 1.6 基于MNIST数据集的更新方法的比较

``````# coding: utf-8
import os
import sys
sys.path.append(os.pardir)  # 为了导入父目录的文件而进行的设定
import matplotlib.pyplot as plt
from dataset.mnist import load_mnist
from common.util import smooth_curve
from common.multi_layer_net import MultiLayerNet
from common.optimizer import *

# 0:读入MNIST数据==========
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True)

train_size = x_train.shape[0]
batch_size = 128
max_iterations = 2000

# 1:进行实验的设置==========
optimizers = {}
optimizers['SGD'] = SGD()
optimizers['Momentum'] = Momentum()
#optimizers['RMSprop'] = RMSprop()

networks = {}
train_loss = {}
for key in optimizers.keys():
networks[key] = MultiLayerNet(
input_size=784, hidden_size_list=[100, 100, 100, 100],
output_size=10)
train_loss[key] = []

# 2:开始训练==========
for i in range(max_iterations):
batch_mask = np.random.choice(train_size, batch_size)

for key in optimizers.keys():

loss = networks[key].loss(x_batch, t_batch)
train_loss[key].append(loss)

if i % 100 == 0:
print( "===========" + "iteration:" + str(i) + "===========")
for key in optimizers.keys():
loss = networks[key].loss(x_batch, t_batch)
print(key + ":" + str(loss))

# 3.绘制图形==========
markers = {"SGD": "o", "Momentum": "x", "AdaGrad": "s", "Adam": "D"}
x = np.arange(max_iterations)
for key in optimizers.keys():
plt.plot(x, smooth_curve(train_loss[key]), marker=markers[key], markevery=100, label=key)
plt.xlabel("iterations")
plt.ylabel("loss")
plt.ylim(0, 1)
plt.legend()
plt.show()

``````
``````===========iteration:0===========
Momentum:2.36538069223
SGD:2.42815475142

......

===========iteration:1800===========
Momentum:0.0814914810944
SGD:0.193988179678
===========iteration:1900===========
Momentum:0.0479927674446
SGD:0.227371311446
``````

# 2. 权重的初始值

## 2.2 隐藏层的激活值分布

``````# coding: utf-8
import numpy as np
import matplotlib.pyplot as plt

def sigmoid(x):
return 1 / (1 + np.exp(-x))

def ReLU(x):
return np.maximum(0, x)

def tanh(x):
return np.tanh(x)

input_data = np.random.randn(1000, 100)  # 1000个数据
node_num = 100  # 各隐藏层的节点（神经元）数
hidden_layer_size = 5  # 隐藏层有5层
activations = {}  # 激活值的结果保存在这里

x = input_data

for i in range(hidden_layer_size):
if i != 0:
x = activations[i-1]

# 改变初始值进行实验！
w = np.random.randn(node_num, node_num) * 1
# w = np.random.randn(node_num, node_num) * 0.01
# w = np.random.randn(node_num, node_num) * np.sqrt(1.0 / node_num)
# w = np.random.randn(node_num, node_num) * np.sqrt(2.0 / node_num)

a = np.dot(x, w)

# 将激活函数的种类也改变，来进行实验！
z = sigmoid(a)
# z = ReLU(a)
# z = tanh(a)

activations[i] = z

# 绘制直方图
for i, a in activations.items():
plt.subplot(1, len(activations), i+1)
plt.title(str(i+1) + "-layer")
if i != 0: plt.yticks([], [])
# plt.xlim(0.1, 1)
# plt.ylim(0, 7000)
plt.hist(a.flatten(), 30, range=(0,1))
plt.show()

``````

• 标准差修改为0.01时的分布：

• 标准差修改为0.1时的分布：

Xavier初始值：

Xavier的论文中，为了使各层的激活值呈现具有相同广度的分布，推导了合适的权重尺度，结论是，如果前一层的节点数为n，则初始值用标准差为1/sqrt(n)的分布，修改代码如下：

``````w = np.random.randn(node_num, node_num) * np.sqrt(1.0 / node_num)
``````

## 2.3 ReLU权重初始值

• 标准差0.01

• 标准差 Xavier初始值

• 标准差 He初始值（标准差为 sqrt(2/n)）

He初始值使用的代码为：

``````w = np.random.randn(node_num, node_num) * np.sqrt(2.0 / node_num)
``````

1. 当激活函数为sigmoid或tanh等S型曲线函数时，初始值使用Xavier初始值
2. 当激活函数为ReLU时，权重初始值使用He初始值

## 2.4 基于MNIST数据集的权重初始值的比较

``````# coding: utf-8
import os
import sys

sys.path.append(os.pardir)  # 为了导入父目录的文件而进行的设定
import numpy as np
import matplotlib.pyplot as plt
from dataset.mnist import load_mnist
from common.util import smooth_curve
from common.multi_layer_net import MultiLayerNet
from common.optimizer import SGD

# 0:读入MNIST数据==========
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True)

train_size = x_train.shape[0]
batch_size = 128
max_iterations = 2000

# 1:进行实验的设置==========
weight_init_types = {'std=0.01': 0.01, 'Xavier': 'sigmoid', 'He': 'relu'}
optimizer = SGD(lr=0.01)

networks = {}
train_loss = {}
for key, weight_type in weight_init_types.items():
networks[key] = MultiLayerNet(input_size=784, hidden_size_list=[100, 100, 100, 100],
output_size=10, weight_init_std=weight_type)
train_loss[key] = []

# 2:开始训练==========
for i in range(max_iterations):
batch_mask = np.random.choice(train_size, batch_size)

for key in weight_init_types.keys():

loss = networks[key].loss(x_batch, t_batch)
train_loss[key].append(loss)

if i % 100 == 0:
print("===========" + "iteration:" + str(i) + "===========")
for key in weight_init_types.keys():
loss = networks[key].loss(x_batch, t_batch)
print(key + ":" + str(loss))

# 3.绘制图形==========
markers = {'std=0.01': 'o', 'Xavier': 's', 'He': 'D'}
x = np.arange(max_iterations)
for key in weight_init_types.keys():
plt.plot(x, smooth_curve(train_loss[key]), marker=markers[key], markevery=100, label=key)
plt.xlabel("iterations")
plt.ylabel("loss")
plt.ylim(0, 2.5)
plt.legend()
plt.show()
``````
``````===========iteration:0===========
He:2.37977141155
Xavier:2.30635119752
std=0.01:2.30253529702

...

===========iteration:1800===========
He:0.166992197315
Xavier:0.274103348063
std=0.01:2.30133710348
===========iteration:1900===========
He:0.299547360141
Xavier:0.388699873789
std=0.01:2.29987822777

``````

# 3. Batch Normalization

## 3.1 Batch Normalization 的算法

1. 可以使学习快速进行(可以增大学习率)
2. 不那么依赖初始值
3. 抑制过拟合(降低Dropout等的必要性)

1. ε是一个微小值，防止除以0的情况
2. γ和β是参数。一开始γ = 1， β = 0，然后再通过学习调整到合适的值。

## 3.2 Batch Normalization的评估

``````# coding: utf-8
import sys, os
sys.path.append(os.pardir)  # 为了导入父目录的文件而进行的设定
import numpy as np
import matplotlib.pyplot as plt
from dataset.mnist import load_mnist
from common.multi_layer_net_extend import MultiLayerNetExtend
from common.optimizer import SGD, Adam

(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True)

# 减少学习数据
x_train = x_train[:1000]
t_train = t_train[:1000]

max_epochs = 20
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.01

def __train(weight_init_std):
bn_network = MultiLayerNetExtend(input_size=784, hidden_size_list=[100, 100, 100, 100, 100], output_size=10,
weight_init_std=weight_init_std, use_batchnorm=True)
network = MultiLayerNetExtend(input_size=784, hidden_size_list=[100, 100, 100, 100, 100], output_size=10,
weight_init_std=weight_init_std)
optimizer = SGD(lr=learning_rate)

train_acc_list = []
bn_train_acc_list = []

iter_per_epoch = max(train_size / batch_size, 1)
epoch_cnt = 0

for i in range(1000000000):
batch_mask = np.random.choice(train_size, batch_size)

for _network in (bn_network, network):

if i % iter_per_epoch == 0:
train_acc = network.accuracy(x_train, t_train)
bn_train_acc = bn_network.accuracy(x_train, t_train)
train_acc_list.append(train_acc)
bn_train_acc_list.append(bn_train_acc)

#print("epoch:" + str(epoch_cnt) + " | " + str(train_acc) + " - " + str(bn_train_acc))

epoch_cnt += 1
if epoch_cnt >= max_epochs:
break

return train_acc_list, bn_train_acc_list

# 3.绘制图形==========
weight_scale_list = np.logspace(0, -4, num=16)
x = np.arange(max_epochs)

for i, w in enumerate(weight_scale_list):
print( "============== " + str(i+1) + "/16" + " ==============")
train_acc_list, bn_train_acc_list = __train(w)

plt.subplot(4,4,i+1)
plt.title("W:" + str(w))
if i == 15:
plt.plot(x, bn_train_acc_list, label='Batch Normalization', markevery=2)
plt.plot(x, train_acc_list, linestyle = "--", label='Normal(without BatchNorm)', markevery=2)
else:
plt.plot(x, bn_train_acc_list, markevery=2)
plt.plot(x, train_acc_list, linestyle="--", markevery=2)

plt.ylim(0, 1.0)
if i % 4:
plt.yticks([])
else:
plt.ylabel("accuracy")
if i < 12:
plt.xticks([])
else:
plt.xlabel("epochs")
plt.legend(loc='lower right')

plt.show()
``````

1. 使用Batch Norm后，学习更快了
2. 另外，不合适的初始值时，没有Batch Norm几乎无法学习

``````bn_network = MultiLayerNetExtend(input_size=784, hidden_size_list=[100, 100, 100, 100, 100], output_size=10,
weight_init_std=weight_init_std, use_batchnorm=True)
``````

## 3.3 MultiLayerNetExtend类

MultiLayerNetExtend类的代码示例如下：

``````# coding: utf-8
import sys, os
sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定
import numpy as np
from collections import OrderedDict
from common.layers import *

class MultiLayerNetExtend:
"""扩展版的全连接的多层神经网络

具有Weiht Decay、Dropout、Batch Normalization的功能

Parameters
----------
input_size : 输入大小（MNIST的情况下为784）
hidden_size_list : 隐藏层的神经元数量的列表（e.g. [100, 100, 100]）
output_size : 输出大小（MNIST的情况下为10）
activation : 'relu' or 'sigmoid'
weight_init_std : 指定权重的标准差（e.g. 0.01）
指定'relu'或'he'的情况下设定“He的初始值”
指定'sigmoid'或'xavier'的情况下设定“Xavier的初始值”
weight_decay_lambda : Weight Decay（L2范数）的强度
use_dropout: 是否使用Dropout
dropout_ration : Dropout的比例
use_batchNorm: 是否使用Batch Normalization
"""
def __init__(self, input_size, hidden_size_list, output_size,
activation='relu', weight_init_std='relu', weight_decay_lambda=0,
use_dropout = False, dropout_ration = 0.5, use_batchnorm=False):
self.input_size = input_size
self.output_size = output_size
self.hidden_size_list = hidden_size_list
self.hidden_layer_num = len(hidden_size_list)
self.use_dropout = use_dropout
self.weight_decay_lambda = weight_decay_lambda
self.use_batchnorm = use_batchnorm
self.params = {}

# 初始化权重
self.__init_weight(weight_init_std)

# 生成层
activation_layer = {'sigmoid': Sigmoid, 'relu': Relu}
self.layers = OrderedDict()
for idx in range(1, self.hidden_layer_num+1):
self.layers['Affine' + str(idx)] = Affine(self.params['W' + str(idx)],
self.params['b' + str(idx)])
if self.use_batchnorm:
self.params['gamma' + str(idx)] = np.ones(hidden_size_list[idx-1])
self.params['beta' + str(idx)] = np.zeros(hidden_size_list[idx-1])
self.layers['BatchNorm' + str(idx)] = BatchNormalization(self.params['gamma' + str(idx)], self.params['beta' + str(idx)])

self.layers['Activation_function' + str(idx)] = activation_layer[activation]()

if self.use_dropout:
self.layers['Dropout' + str(idx)] = Dropout(dropout_ration)

idx = self.hidden_layer_num + 1
self.layers['Affine' + str(idx)] = Affine(self.params['W' + str(idx)], self.params['b' + str(idx)])

self.last_layer = SoftmaxWithLoss()

def __init_weight(self, weight_init_std):
"""设定权重的初始值

Parameters
----------
weight_init_std : 指定权重的标准差（e.g. 0.01）
指定'relu'或'he'的情况下设定“He的初始值”
指定'sigmoid'或'xavier'的情况下设定“Xavier的初始值”
"""
all_size_list = [self.input_size] + self.hidden_size_list + [self.output_size]
for idx in range(1, len(all_size_list)):
scale = weight_init_std
if str(weight_init_std).lower() in ('relu', 'he'):
scale = np.sqrt(2.0 / all_size_list[idx - 1])  # 使用ReLU的情况下推荐的初始值
elif str(weight_init_std).lower() in ('sigmoid', 'xavier'):
scale = np.sqrt(1.0 / all_size_list[idx - 1])  # 使用sigmoid的情况下推荐的初始值
self.params['W' + str(idx)] = scale * np.random.randn(all_size_list[idx-1], all_size_list[idx])
self.params['b' + str(idx)] = np.zeros(all_size_list[idx])

def predict(self, x, train_flg=False):
for key, layer in self.layers.items():
if "Dropout" in key or "BatchNorm" in key:
x = layer.forward(x, train_flg)
else:
x = layer.forward(x)

return x

def loss(self, x, t, train_flg=False):
"""求损失函数
参数x是输入数据，t是教师标签
"""
y = self.predict(x, train_flg)

weight_decay = 0
for idx in range(1, self.hidden_layer_num + 2):
W = self.params['W' + str(idx)]
weight_decay += 0.5 * self.weight_decay_lambda * np.sum(W**2)

return self.last_layer.forward(y, t) + weight_decay

def accuracy(self, X, T):
Y = self.predict(X, train_flg=False)
Y = np.argmax(Y, axis=1)
if T.ndim != 1 : T = np.argmax(T, axis=1)

accuracy = np.sum(Y == T) / float(X.shape[0])
return accuracy

def numerical_gradient(self, X, T):
"""求梯度（数值微分）

Parameters
----------
X : 输入数据
T : 教师标签

Returns
-------
具有各层的梯度的字典变量
"""
loss_W = lambda W: self.loss(X, T, train_flg=True)

for idx in range(1, self.hidden_layer_num+2):
grads['W' + str(idx)] = numerical_gradient(loss_W, self.params['W' + str(idx)])
grads['b' + str(idx)] = numerical_gradient(loss_W, self.params['b' + str(idx)])

if self.use_batchnorm and idx != self.hidden_layer_num+1:
grads['gamma' + str(idx)] = numerical_gradient(loss_W, self.params['gamma' + str(idx)])
grads['beta' + str(idx)] = numerical_gradient(loss_W, self.params['beta' + str(idx)])

def gradient(self, x, t):
# forward
self.loss(x, t, train_flg=True)

# backward
dout = 1
dout = self.last_layer.backward(dout)

layers = list(self.layers.values())
layers.reverse()
for layer in layers:
dout = layer.backward(dout)

# 设定
for idx in range(1, self.hidden_layer_num+2):
grads['W' + str(idx)] = self.layers['Affine' + str(idx)].dW + self.weight_decay_lambda * self.params['W' + str(idx)]
grads['b' + str(idx)] = self.layers['Affine' + str(idx)].db

if self.use_batchnorm and idx != self.hidden_layer_num+1:
grads['gamma' + str(idx)] = self.layers['BatchNorm' + str(idx)].dgamma
grads['beta' + str(idx)] = self.layers['BatchNorm' + str(idx)].dbeta

``````

# 4. 正则化

## 4.1 过拟合

1. 模型有大量的参数，表现力强(神经网络复杂，神经元多，网络层数多)
2. 训练数据少

``````# coding: utf-8
import os
import sys

sys.path.append(os.pardir)  # 为了导入父目录的文件而进行的设定
import numpy as np
import matplotlib.pyplot as plt
from dataset.mnist import load_mnist
from common.multi_layer_net import MultiLayerNet
from common.optimizer import SGD

(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True)

# 为了再现过拟合，减少学习数据
x_train = x_train[:300]
t_train = t_train[:300]

# weight decay（权值衰减）的设定 =======================
weight_decay_lambda = 0 # 不使用权值衰减的情况
#weight_decay_lambda = 0.1
# ====================================================

network = MultiLayerNet(input_size=784, hidden_size_list=[100, 100, 100, 100, 100, 100], output_size=10,
weight_decay_lambda=weight_decay_lambda)
optimizer = SGD(lr=0.01)

max_epochs = 201
train_size = x_train.shape[0]
batch_size = 100

train_loss_list = []
train_acc_list = []
test_acc_list = []

iter_per_epoch = max(train_size / batch_size, 1)
epoch_cnt = 0

for i in range(1000000000):
batch_mask = np.random.choice(train_size, batch_size)

if i % iter_per_epoch == 0:
train_acc = network.accuracy(x_train, t_train)
test_acc = network.accuracy(x_test, t_test)
train_acc_list.append(train_acc)
test_acc_list.append(test_acc)

print("epoch:" + str(epoch_cnt) + ", train acc:" + str(train_acc) + ", test acc:" + str(test_acc))

epoch_cnt += 1
if epoch_cnt >= max_epochs:
break

# 3.绘制图形==========
markers = {'train': 'o', 'test': 's'}
x = np.arange(max_epochs)
plt.plot(x, train_acc_list, marker='o', label='train', markevery=10)
plt.plot(x, test_acc_list, marker='s', label='test', markevery=10)
plt.xlabel("epochs")
plt.ylabel("accuracy")
plt.ylim(0, 1.0)
plt.legend(loc='lower right')
plt.show()
``````

## 4.2 权值衰减

``````# weight decay（权值衰减）的设定 =======================
#weight_decay_lambda = 0 # 不使用权值衰减的情况
weight_decay_lambda = 0.1 # 使用权值衰减的情况

network = MultiLayerNet(input_size=784, hidden_size_list=[100, 100, 100, 100, 100, 100], output_size=10,
weight_decay_lambda=weight_decay_lambda)

``````

MultiLayerNet类中，关于loss函数的实现如下：

1. weight_decay_lambda 为0 的时候，下面的函数中weight_decay也为0，所以相当于没有变化。
2. weight_decay_lambda为0.1时，最终会加上weight_decay这一项；0.5·λ·W²对W求导的结果为λW，因此反向传播时每个权重的梯度会额外加上λW，参数更新时相当于额外减去lr·λW，从而抑制权重变大。
``````    def loss(self, x, t, train_flg=False):
"""求损失函数
参数x是输入数据，t是教师标签
"""
y = self.predict(x, train_flg)

weight_decay = 0
for idx in range(1, self.hidden_layer_num + 2):
W = self.params['W' + str(idx)]
weight_decay += 0.5 * self.weight_decay_lambda * np.sum(W**2)

return self.last_layer.forward(y, t) + weight_decay
``````

## 4.3 Dropout

Dropout是一种在学习的过程中随机删除神经元的方法，训练时，随机选出隐藏层的神经元，然后将其删除，被删除的神经元不再进行信号的传递。

``````# coding: utf-8
import os
import sys
sys.path.append(os.pardir)  # 为了导入父目录的文件而进行的设定
import numpy as np
import matplotlib.pyplot as plt
from dataset.mnist import load_mnist
from common.multi_layer_net_extend import MultiLayerNetExtend
from common.trainer import Trainer

(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True)

# 为了再现过拟合，减少学习数据
x_train = x_train[:300]
t_train = t_train[:300]

# 设定是否使用Dropuout，以及比例 ========================
use_dropout = True  # 不使用Dropout的情况下为False
dropout_ratio = 0.2
# ====================================================

network = MultiLayerNetExtend(input_size=784, hidden_size_list=[100, 100, 100, 100, 100, 100],
output_size=10, use_dropout=use_dropout, dropout_ration=dropout_ratio)
trainer = Trainer(network, x_train, t_train, x_test, t_test,
epochs=301, mini_batch_size=100,
optimizer='sgd', optimizer_param={'lr': 0.01}, verbose=True)
trainer.train()

train_acc_list, test_acc_list = trainer.train_acc_list, trainer.test_acc_list

# 绘制图形==========
markers = {'train': 'o', 'test': 's'}
x = np.arange(len(train_acc_list))
plt.plot(x, train_acc_list, marker='o', label='train', markevery=10)
plt.plot(x, test_acc_list, marker='s', label='test', markevery=10)
plt.xlabel("epochs")
plt.ylabel("accuracy")
plt.ylim(0, 1.0)
plt.legend(loc='lower right')
plt.show()
``````

### 1. Dropout的实现代码：

``````class Dropout:
"""
http://arxiv.org/abs/1207.0580
"""
def __init__(self, dropout_ratio=0.5):
self.dropout_ratio = dropout_ratio

def forward(self, x, train_flg=True):
if train_flg:
self.mask = np.random.rand(*x.shape) > self.dropout_ratio
return x * self.mask
else:
return x * (1.0 - self.dropout_ratio)

def backward(self, dout):
return dout * self.mask
``````

## 4.4 Trainer类

``````# coding: utf-8
import sys, os
sys.path.append(os.pardir)  # 为了导入父目录的文件而进行的设定
import numpy as np
from common.optimizer import *

class Trainer:
"""进行神经网络的训练的类
"""
def __init__(self, network, x_train, t_train, x_test, t_test,
epochs=20, mini_batch_size=100,
optimizer='SGD', optimizer_param={'lr':0.01},
evaluate_sample_num_per_epoch=None, verbose=True):
self.network = network
self.verbose = verbose
self.x_train = x_train
self.t_train = t_train
self.x_test = x_test
self.t_test = t_test
self.epochs = epochs
self.batch_size = mini_batch_size
self.evaluate_sample_num_per_epoch = evaluate_sample_num_per_epoch

# optimzer
optimizer_class_dict = {'sgd':SGD, 'momentum':Momentum, 'nesterov':Nesterov,
self.optimizer = optimizer_class_dict[optimizer.lower()](**optimizer_param)

self.train_size = x_train.shape[0]
self.iter_per_epoch = max(self.train_size / mini_batch_size, 1)
self.max_iter = int(epochs * self.iter_per_epoch)
self.current_iter = 0
self.current_epoch = 0

self.train_loss_list = []
self.train_acc_list = []
self.test_acc_list = []

def train_step(self):
batch_mask = np.random.choice(self.train_size, self.batch_size)

loss = self.network.loss(x_batch, t_batch)
self.train_loss_list.append(loss)
if self.verbose: print("train loss:" + str(loss))

if self.current_iter % self.iter_per_epoch == 0:
self.current_epoch += 1

x_train_sample, t_train_sample = self.x_train, self.t_train
x_test_sample, t_test_sample = self.x_test, self.t_test
if not self.evaluate_sample_num_per_epoch is None:
t = self.evaluate_sample_num_per_epoch
x_train_sample, t_train_sample = self.x_train[:t], self.t_train[:t]
x_test_sample, t_test_sample = self.x_test[:t], self.t_test[:t]

train_acc = self.network.accuracy(x_train_sample, t_train_sample)
test_acc = self.network.accuracy(x_test_sample, t_test_sample)
self.train_acc_list.append(train_acc)
self.test_acc_list.append(test_acc)

if self.verbose: print("=== epoch:" + str(self.current_epoch) + ", train acc:" + str(train_acc) + ", test acc:" + str(test_acc) + " ===")
self.current_iter += 1

def train(self):
for i in range(self.max_iter):
self.train_step()

test_acc = self.network.accuracy(self.x_test, self.t_test)

if self.verbose:
print("=============== Final Test Accuracy ===============")
print("test acc:" + str(test_acc))
``````

# 5. 超参数的验证

• 神经元数量
• batch大小
• 参数更新时的学习率或权值衰减

## 5.1 验证数据

• 训练数据：用于参数(权重和偏置)的学习
• 验证数据：用于超参数的性能评估
• 测试数据：最后使用(比较理想的是只用一次)，用于确认泛化能力

``````(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True)

# 分割验证数据
validation_rate = 0.20
validation_num = x_train.shape[0] * validation_rate
x_train, t_train = shuffle_dataset(x_train, t_train)
x_val = x_train[:validation_num]
t_val = t_train[:validation_num]
x_train = x_train[validation_num:]
t_train = t_train[validation_num:]
``````

``````def shuffle_dataset(x, t):
"""打乱数据集

Parameters
----------
x : 训练数据
t : 监督数据

Returns
-------
x, t : 打乱的训练数据和监督数据
"""
permutation = np.random.permutation(x.shape[0])
x = x[permutation,:] if x.ndim == 2 else x[permutation,:,:,:]
t = t[permutation]

return x, t
``````

## 5.2 超参数的最优化

1. 设定超参数的范围
2. 从设定的超参数范围中随机采样
3. 使用步骤2采样的超参数值进行学习，通过验证数据评估识别精度
4. 重复上面步骤2和3(100次)，根据它们的识别精度结果，缩小超参数的范围

## 5.3 超参数最优化的实现

• 学习率初始范围是 `10**(-6)到10**(-2)`之间
• 权值衰减系数初始范围是 `10**(-8)到10**(-4)`之间

``````    # 指定搜索的超参数的范围===============
weight_decay = 10 ** np.random.uniform(-8, -4)
lr = 10 ** np.random.uniform(-6, -2)
# ================================================
``````

``````# coding: utf-8
import sys, os
sys.path.append(os.pardir)  # 为了导入父目录的文件而进行的设定
import numpy as np
import matplotlib.pyplot as plt
from dataset.mnist import load_mnist
from common.multi_layer_net import MultiLayerNet
from common.util import shuffle_dataset
from common.trainer import Trainer

(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True)

# 为了实现高速化，减少训练数据
x_train = x_train[:500]
t_train = t_train[:500]

# 分割验证数据
validation_rate = 0.20
validation_num = int(x_train.shape[0] * validation_rate)
print(validation_num )
x_train, t_train = shuffle_dataset(x_train, t_train)
x_val = x_train[:validation_num]
t_val = t_train[:validation_num]
x_train = x_train[validation_num:]
t_train = t_train[validation_num:]

def __train(lr, weight_decay, epocs=50):
network = MultiLayerNet(input_size=784, hidden_size_list=[100, 100, 100, 100, 100, 100],
output_size=10, weight_decay_lambda=weight_decay)
trainer = Trainer(network, x_train, t_train, x_val, t_val,
epochs=epocs, mini_batch_size=100,
optimizer='sgd', optimizer_param={'lr': lr}, verbose=False)
trainer.train()

return trainer.test_acc_list, trainer.train_acc_list

# 超参数的随机搜索======================================
optimization_trial = 100
results_val = {}
results_train = {}
for _ in range(optimization_trial):
# 指定搜索的超参数的范围===============
weight_decay = 10 ** np.random.uniform(-8, -4)
lr = 10 ** np.random.uniform(-6, -2)
# ================================================

val_acc_list, train_acc_list = __train(lr, weight_decay)
print("val acc:" + str(val_acc_list[-1]) + " | lr:" + str(lr) + ", weight decay:" + str(weight_decay))
key = "lr:" + str(lr) + ", weight decay:" + str(weight_decay)
results_val[key] = val_acc_list
results_train[key] = train_acc_list

# 绘制图形========================================================
print("=========== Hyper-Parameter Optimization Result ===========")
graph_draw_num = 20
col_num = 5
row_num = int(np.ceil(graph_draw_num / col_num))
i = 0

for key, val_acc_list in sorted(results_val.items(), key=lambda x:x[1][-1], reverse=True):
print("Best-" + str(i+1) + "(val acc:" + str(val_acc_list[-1]) + ") | " + key)

plt.subplot(row_num, col_num, i+1)
plt.title("Best-" + str(i+1))
plt.ylim(0.0, 1.0)
if i % 5: plt.yticks([])
plt.xticks([])
x = np.arange(len(val_acc_list))
plt.plot(x, val_acc_list)
plt.plot(x, results_train[key], "--")
i += 1

if i >= graph_draw_num:
break

plt.show()

``````
``````=========== Hyper-Parameter Optimization Result ===========
Best-1(val acc:0.82) | lr:0.008601187445014789, weight decay:4.142947393287626e-06
Best-2(val acc:0.8) | lr:0.008284953249444884, weight decay:1.3047654928102771e-08
Best-3(val acc:0.79) | lr:0.00797182644121508, weight decay:4.530264809019064e-07
Best-4(val acc:0.77) | lr:0.006358159795538981, weight decay:4.753737847569889e-06
Best-5(val acc:0.77) | lr:0.005930410669775257, weight decay:1.1280065697356176e-07
``````