Underfitting, Overfitting, and Regularization

Underfitting

Refers to the case where the training error on the training set is high.
A likely cause of underfitting: complex data paired with a model that is too simple.

Overfitting

Refers to the case where the training error on the training set is low, but the generalization error on the test set is high.
A likely cause of overfitting: too little training data combined with an overly powerful model; the model appears to have "learned everything" from the training set, yet ends up with a large generalization error.
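
To make the two terms precise, here is a minimal formalization (the notation below is mine, not from the original notes):

% training error: average loss over the n training samples
\hat{R}(f) = \frac{1}{n}\sum_{i=1}^{n} \ell\big(f(x_i), y_i\big)

% generalization error: expected loss on unseen data, estimated by the test set
R(f) = \mathbb{E}_{(x,y)}\big[\ell\big(f(x), y\big)\big]

Underfitting means \hat{R}(f) is large; overfitting means \hat{R}(f) is small while R(f) is much larger.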


Underfitting/overfitting code

# Target function: y = 1.2*x - 3.4*x**2 + 5.6*x**3 + 5.0 + noise
from mxnet import ndarray as nd
from mxnet import autograd as ag
from mxnet import gluon

num_train = 100
num_test = 100
true_w = [1.2, -3.4, 5.6]
true_b = 5.0
x = nd.random_normal(shape=(num_test + num_train, 1))
X = nd.concat(x, nd.power(x, 2), nd.power(x, 3))  # features [x, x^2, x^3]
y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_w[2] * X[:, 2] + true_b
y += nd.random_normal(shape=y.shape)

import matplotlib as mpl
# set the figure resolution
mpl.rcParams['figure.dpi'] = 120
import matplotlib.pyplot as plt

def train(x_train, x_test, y_train, y_test):
    net = gluon.nn.Sequential()
    with net.name_scope():
        net.add(gluon.nn.Dense(1))
    net.initialize()
    dataset = gluon.data.ArrayDataset(x_train, y_train)
    epoch = 100
    learning_rate = 0.01
    batch_size = min(10, y_train.shape[0])
    data_iter = gluon.data.DataLoader(dataset, batch_size, shuffle=True)
    square_loss = gluon.loss.L2Loss()
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': learning_rate})
    train_loss = []
    test_loss = []
    for e in range(epoch):
        for data, label in data_iter:
            with ag.record():
                output = net(data)
                loss = square_loss(output, label)
            loss.backward()
            trainer.step(batch_size)
        temp_train = square_loss(net(x_train), y_train).mean().asscalar()
        train_loss.append(temp_train)
        temp_test = square_loss(net(x_test), y_test).mean().asscalar()
        print('train loss, test loss', temp_train, temp_test)
        test_loss.append(temp_test)
    plt.plot(train_loss)
    plt.plot(test_loss)
    plt.legend(['train', 'test'])
    plt.show()
    print('learned weight', net[0].weight.data())
    print('learned bias', net[0].bias.data())

# Enough data and the full cubic features: the model generalizes well.
print('shape of x, shape of X', x.shape, X.shape)
train(X[:num_train, :], X[num_train:, :], y[:num_train], y[num_train:])
# Linear feature only on cubic data: underfitting.
train(x[:num_train, :], x[num_train:, :], y[:num_train], y[num_train:])
# Only 2 training samples with the complex (cubic) model: overfitting.
train(X[0:2, :], X[num_train:, :], y[0:2], y[num_train:])

Regularization code

from mxnet import ndarray as nd
from mxnet import autograd as ag
from mxnet import gluon
# The point of regularization is to weaken the model, i.e. lower its effective
# complexity. The penalty is added to the loss with a weight (lamda) that
# controls how strongly complexity is punished; minimizing the new loss pushes
# w and b toward 0.

num_train = 20
num_test = 100
num_inputs = 200

true_w = nd.ones(shape=(num_inputs, 1)) * 0.01
true_b = 0.05
x = nd.random_normal(shape=(num_train + num_test, num_inputs))
y = nd.dot(x, true_w) + true_b
y += nd.random_normal(shape=y.shape) * 0.01

batch_size = 1
dataset = gluon.data.ArrayDataset(x[:num_train, :], y[:num_train, :])
train_data = gluon.data.DataLoader(dataset, batch_size, shuffle=True)

w = nd.random_normal(shape=(num_inputs, 1), scale=1)
b = nd.zeros(shape=(1,))
lr = 0.01

params = [w, b]
for param in params:
    param.attach_grad()

def net(x):
    return nd.dot(x, w) + b

# L2 norm penalty
def L2_penalty(w, b):
    return ((w ** 2).sum() + b ** 2) / 2

# Plain squared loss; the penalty is added to it inside train() below.
def square_loss(yhat, y):
    return (yhat - y.reshape(yhat.shape)) ** 2 / 2

def SGD(params):
    for param in params:
        param[:] = param - lr * param.grad / batch_size

import matplotlib as mpl
# set the figure resolution
mpl.rcParams['figure.dpi'] = 120
import matplotlib.pyplot as plt

def train(lamda):
    # Reset the parameters so each call starts from a fresh model.
    w[:] = nd.random_normal(shape=(num_inputs, 1), scale=1)
    b[:] = 0
    epoch = 5
    train_loss = []
    test_loss = []
    for e in range(epoch):
        for data, label in train_data:
            with ag.record():
                yhat = net(data)
                loss = square_loss(yhat, label) + lamda * L2_penalty(*params)
            loss.backward()
            SGD(params)
        train_loss.append(square_loss(net(x[:num_train, :]),
                                      y[:num_train, :]).mean().asscalar())
        test_loss.append(square_loss(net(x[num_train:, :]),
                                     y[num_train:, :]).mean().asscalar())
    plt.plot(train_loss)
    plt.plot(test_loss)
    plt.legend(['train', 'test'])
    plt.show()

train(0)   # no penalty: severe overfitting (20 samples, 200 inputs)
train(3)   # with the L2 penalty: the test curve improves
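
For reference, here is the standard weight-decay algebra that the comment at the top of this code alludes to (my own sketch, not part of the original notes). With penalty weight \lambda and learning rate \eta, the regularized loss and the resulting SGD step are

\tilde{\ell}(w, b) = \ell(w, b) + \frac{\lambda}{2}\left(\lVert w \rVert^2 + b^2\right)

w \leftarrow w - \eta\,\frac{\partial \tilde{\ell}}{\partial w}
  = (1 - \eta\lambda)\,w - \eta\,\frac{\partial \ell}{\partial w}

so every update first shrinks w by the factor (1 - \eta\lambda) before applying the data gradient, which is why L2 regularization is also called weight decay.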

Addendum: another way to prevent overfitting

Dropout

Dropout consists of two steps:
1. For the layer's output, drop each element with a given probability (multiply it by 0).
2. Scale up the elements that are kept, so that the expectation of the output stays unchanged.

from mxnet import nd

def dropout(X, drop_probability):
    keep_probability = 1 - drop_probability
    # Drop every element.
    if keep_probability == 0:
        return X.zeros_like()

    # Randomly pick which elements of this layer's output to drop.
    mask = nd.random.uniform(0, 1.0, X.shape) < keep_probability
    # Rescale the kept elements so that E[dropout(X)] == X.
    scale = 1 / keep_probability
    return mask * X * scale
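
A quick sanity check of the function above (the test array is just an illustration):

A = nd.arange(12).reshape((3, 4))
print(dropout(A, 0.0))   # unchanged
print(dropout(A, 0.5))   # about half the entries zeroed, the rest doubled
print(dropout(A, 1.0))   # all zeros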

The essence of dropout:
There are many explanations of dropout. Some treat it as a form of ensemble learning, which I do not find entirely intuitive; it is still something of a black box, and adding dropout does not guarantee higher accuracy than leaving it out.

Update:

Since every neuron in a hidden layer may be dropped, the final output cannot depend too heavily on any single neuron, i.e. the corresponding parameters are pushed toward 0, so dropout acts as a form of regularization to some extent.
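
As a sketch of how the hand-written dropout above is typically used (the layer sizes and drop rates here are my own illustrative choices, not from the original notes): apply it between hidden layers during training only, and skip it at evaluation time.

from mxnet import nd

# Illustrative two-hidden-layer MLP; W1/b1, W2/b2, W3/b3 and the drop rates are hypothetical.
W1 = nd.random_normal(shape=(784, 256), scale=0.01)
b1 = nd.zeros(256)
W2 = nd.random_normal(shape=(256, 256), scale=0.01)
b2 = nd.zeros(256)
W3 = nd.random_normal(shape=(256, 10), scale=0.01)
b3 = nd.zeros(10)

drop_prob1, drop_prob2 = 0.2, 0.5

def mlp(X, training=True):
    h1 = nd.relu(nd.dot(X, W1) + b1)
    if training:                     # dropout is applied only during training
        h1 = dropout(h1, drop_prob1)
    h2 = nd.relu(nd.dot(h1, W2) + b2)
    if training:
        h2 = dropout(h2, drop_prob2)
    return nd.dot(h2, W3) + b3

At test time the call would be mlp(X, training=False), so predictions are deterministic, while the training-time noise provides the regularization effect described above.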