# 机器学习算法实验的重复次数估计

### 准备数据

from numpy.random import seed
from numpy.random import normal
from numpy import savetxt
# Fix the random state first so the simulated results are reproducible.
seed(1)
# Parameters of the ideal (Gaussian) distribution the results are drawn from.
mean, stev = 60, 10
# Draw 1000 simulated experiment results from that distribution.
results = normal(loc=mean, scale=stev, size=1000)
# Persist the samples as plain text, one value per line.
savetxt(fname='results.csv', X=results)

...
6.160564991742511864e+01
5.879850024371251038e+01
6.385602292344325548e+01
6.718290735754342791e+01
7.291188902850875309e+01
5.883555851728335995e+01
3.722702003339634302e+01
5.930375460544870947e+01
6.353870426882840405e+01
5.813044983467250404e+01

### 基本分析

1. 计算统计信息，比如均值、标准差、百分位等等；
2. 对数据绘制箱形图；
3. 绘制数据的直方图分布。

from pandas import DataFrame
from pandas import read_csv
from numpy import mean
from numpy import std
from matplotlib import pyplot
# Load the saved results into a one-column DataFrame. describe(), boxplot()
# and hist() below are DataFrame methods; the NumPy array produced by the
# data-preparation step does not provide them.
results = read_csv('results.csv', header=None)
# descriptive stats
print(results.describe())
# box and whisker plot
results.boxplot()
pyplot.show()
# histogram
results.hist()
pyplot.show()

count  1000.000000
mean     60.388125
std       9.814950
min      29.462356
25%      53.998396
50%      60.412926
75%      67.039989
max      99.586027

### 重复次数的影响

from pandas import DataFrame
from pandas import read_csv
from numpy import mean
from matplotlib import pyplot
import numpy
# Load the saved results as a one-column DataFrame so that `.values` is a
# 2-D array and the `[0:i, 0]` indexing below is valid.
results = read_csv('results.csv', header=None)
values = results.values
# collect cumulative stats: entry i-1 is the mean of the first i results
means = list()
for i in range(1, len(values) + 1):
    data = values[0:i, 0]
    mean_rmse = mean(data)
    means.append(mean_rmse)
# line plot of cumulative values
pyplot.plot(means)
pyplot.show()

from pandas import DataFrame
from pandas import read_csv
from numpy import mean
from matplotlib import pyplot
import numpy
# Load the saved results as a one-column DataFrame so that `.values` is a
# 2-D array and the `[0:i, 0]` indexing below is valid.
results = read_csv('results.csv', header=None)
values = results.values
# Mean over all results, drawn as a flat reference line.
final_mean = mean(values)
# collect cumulative stats for the first 500 repeats only
means = list()
for i in range(1, 501):
    data = values[0:i, 0]
    mean_rmse = mean(data)
    means.append(mean_rmse)
# line plot of cumulative values
pyplot.plot(means)
pyplot.plot([final_mean] * len(means))
pyplot.show()

### 计算标准误差

from pandas import read_csv
from numpy import std
from numpy import mean
from matplotlib import pyplot
from math import sqrt
# Load the saved results as a one-column DataFrame so that `.values` is a
# 2-D array and the `[0:i, 0]` indexing below is valid.
results = read_csv('results.csv', header=None)
values = results.values
# collect cumulative stats: standard error of the mean of the first i results
# NOTE: numpy.std defaults to ddof=0 (population standard deviation).
std_errors = list()
for i in range(1, len(values) + 1):
    data = values[0:i, 0]
    stderr = std(data) / sqrt(len(data))
    std_errors.append(stderr)
# line plot of cumulative values
pyplot.plot(std_errors)
pyplot.show()

from pandas import read_csv
from numpy import std
from numpy import mean
from matplotlib import pyplot
from math import sqrt
# Load the saved results as a one-column DataFrame so that `.values` is a
# 2-D array and the `[0:i, 0]` indexing below is valid.
results = read_csv('results.csv', header=None)
values = results.values
# collect cumulative stats: standard error of the mean of the first i results
# NOTE: numpy.std defaults to ddof=0 (population standard deviation).
std_errors = list()
for i in range(1, len(values) + 1):
    data = values[0:i, 0]
    stderr = std(data) / sqrt(len(data))
    std_errors.append(stderr)
# line plot of cumulative values
pyplot.plot(std_errors)
# Horizontal reference lines at standard errors of 0.5 and 1.
pyplot.plot([0.5] * len(std_errors), color='red')
pyplot.plot([1] * len(std_errors), color='red')
pyplot.show()

from pandas import read_csv
from numpy import std
from numpy import mean
from matplotlib import pyplot
from math import sqrt
# Load the saved results as a one-column DataFrame so that `.values` is a
# 2-D array and the `[0:i, 0]` indexing below is valid.
results = read_csv('results.csv', header=None)
values = results.values
# collect cumulative stats, starting from 20 repeats up to all of them
means, confidence = list(), list()
n = len(values) + 1
for i in range(20, n):
    data = values[0:i, 0]
    mean_rmse = mean(data)
    stderr = std(data) / sqrt(len(data))
    # Half-width of the interval: 1.96 standard errors (the usual
    # normal-approximation 95% interval).
    conf = stderr * 1.96
    means.append(mean_rmse)
    confidence.append(conf)
# line plot of cumulative values
pyplot.errorbar(range(20, n), means, yerr=confidence)
# Reference line at 60, the mean of the distribution the data was drawn from.
pyplot.plot(range(20, n), [60] * len(means), color='red')
pyplot.show()

from pandas import read_csv
from numpy import std
from numpy import mean
from matplotlib import pyplot
from math import sqrt
# Load the saved results as a one-column DataFrame so that `.values` is a
# 2-D array and the `[0:i, 0]` indexing below is valid.
results = read_csv('results.csv', header=None)
values = results.values
# collect cumulative stats for 20 to 200 repeats (a zoomed-in view)
means, confidence = list(), list()
n = 200 + 1
for i in range(20, n):
    data = values[0:i, 0]
    mean_rmse = mean(data)
    stderr = std(data) / sqrt(len(data))
    # Half-width of the interval: 1.96 standard errors (the usual
    # normal-approximation 95% interval).
    conf = stderr * 1.96
    means.append(mean_rmse)
    confidence.append(conf)
# line plot of cumulative values
pyplot.errorbar(range(20, n), means, yerr=confidence)
# Reference line at 60, the mean of the distribution the data was drawn from.
pyplot.plot(range(20, n), [60] * len(means), color='red')
pyplot.show()

### 小结

• 简单地尝试重复30次、100次或者1000次等等；
• 绘制样本均值与重复次数的关系图，并根据拐点选择；
• 绘制标准误差与重复次数的关系图，并根据误差阈值选择；
• 绘制置信区间与重复次数的关系图，并根据误差的分布选择。

【6位AI技术大咖分享研发经验 | 本周直播限时特惠 】本期直播（5月13日 周六）邀请来自阿里巴巴、思必驰、第四范式、一点资讯、58集团、PercepIn等在AI领域有着领先技术研究的一批专家，他们将针对人脸识别、卷积神经网络、大规模分布式机器学习系统搭建、推荐系统、自然语言处理及SLAM在机器人领域应用等热点话题进行分享。限时特惠：199元即可听6位技术专家的在线分享，点击报名，加微信小助手 csdncxrs 备注“人工智能”入群。

#### 实验二 分类算法实验

2017-12-24 16:32:06

#### 文本分类与SVM 原文地址：http://blog.csdn.net/zhzhl202/article/details/8197109

2016-12-06 14:38:03

#### 深度解析机器学习中的置信区间（附代码）

2018-07-02 19:00:00

#### 机器学习中防止过拟合的处理方法

2015-10-26 20:58:12

#### 【TensorFlow】学习率、迭代次数和初始化方式对准确率的影响

2016-10-29 16:46:50

#### 神经网络（持续更新）

2017-04-19 18:46:38

#### 实验四 深度学习算法及应用

2018-01-01 15:04:00

#### 机器学习中的偏差，方差，训练误差，测试误差相关

2017-08-14 11:25:20

#### 8种常见机器学习算法比较

2016-10-26 20:35:41

#### 几个常用机器学习算法 - 决策树算法

2016-10-25 18:03:33