1.内容
线性回归(广告数据)与鸢尾花分类识别
2.代码
Advertising.py
#!/usr/bin/python
# -*- coding:utf-8 -*-
import csv
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
if __name__ == "__main__":
    path = '8.Advertising.csv'
    # Load with pandas (the notes also showed manual/csv/np.loadtxt readers;
    # pandas is the idiomatic one). Columns: TV, Radio, Newspaper, Sales.
    data = pd.read_csv(path)
    x = data[['TV', 'Radio', 'Newspaper']]
    # x = data[['TV', 'Radio']]
    y = data['Sales']
    print(x)
    print(y)

    # Scatter each advertising channel against sales on one axis.
    plt.plot(data['TV'], y, 'ro', label='TV')
    plt.plot(data['Radio'], y, 'g^', label='Radio')
    plt.plot(data['Newspaper'], y, 'mv', label='Newspaper')  # label typo 'Newspaer' fixed
    plt.legend(loc='lower right')
    plt.grid()
    plt.show()

    # Hold out a test set (default 25%) and fit ordinary least squares.
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)
    linreg = LinearRegression()
    model = linreg.fit(x_train, y_train)
    print(model)
    print(linreg.coef_)       # one coefficient per feature column
    print(linreg.intercept_)

    # Evaluate on the held-out rows.
    y_hat = linreg.predict(np.array(x_test))
    mse = np.average((y_hat - np.array(y_test)) ** 2)  # Mean Squared Error
    rmse = np.sqrt(mse)                                # Root Mean Squared Error
    print(mse, rmse)

    # Overlay predictions on the true test targets, indexed by sample position.
    t = np.arange(len(x_test))
    plt.plot(t, y_test, 'r-', linewidth=2, label='Test')
    plt.plot(t, y_hat, 'g-', linewidth=2, label='Predict')
    plt.legend(loc='upper right')
    plt.grid()
    plt.show()
LinearRegression_CV.py
#!/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV
if __name__ == "__main__":
    # Columns: TV, Radio, Newspaper, Sales.
    data = pd.read_csv('8.Advertising.csv')
    x = data[['TV', 'Radio', 'Newspaper']]
    # x = data[['TV', 'Radio']]
    y = data['Sales']
    print(x)
    print(y)

    # Train/test split; train fraction defaults to 0.75.
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)
    model = Lasso()   # Lasso regression
    # model = Ridge() # ridge regression alternative
    alpha_can = np.logspace(-3, 2, 10)
    # 5-fold cross-validated grid search over the regularization strength.
    lasso_model = GridSearchCV(model, param_grid={'alpha': alpha_can}, cv=5)
    # Fit on the TRAINING split only. The original fit on all of (x, y),
    # which leaks the test rows into model selection and makes the test
    # error below optimistically biased.
    lasso_model.fit(x_train, y_train)
    print('验证参数:\n', lasso_model.best_params_)

    y_hat = lasso_model.predict(np.array(x_test))
    mse = np.average((y_hat - np.array(y_test)) ** 2)  # Mean Squared Error
    rmse = np.sqrt(mse)                                # Root Mean Squared Error
    print(mse, rmse)

    t = np.arange(len(x_test))
    plt.plot(t, y_test, 'r-', linewidth=2, label='Test')
    plt.plot(t, y_hat, 'g-', linewidth=2, label='Predict')
    plt.legend(loc='upper right')
    plt.grid()
    plt.show()
Iris_LR.py
#!/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import preprocessing
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
def iris_type(s):
    """Map an iris species label to its integer class id (0, 1 or 2).

    Used as a column converter for np.loadtxt. Under Python 3, loadtxt
    passes converter inputs as ``bytes`` (the original str-keyed lookup
    raised KeyError there), so both bytes and str are accepted.
    """
    if isinstance(s, bytes):
        s = s.decode('ascii')
    it = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
    return it[s]
if __name__ == "__main__":
    path = u'8.iris.data'  # data file path
    # Load the CSV as floats; column 4 (the species string) is turned into
    # an integer class id by the iris_type converter.
    data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type})
    print(data)

    # Columns 0..3 become the feature matrix x, column 4 the label y.
    x, y = np.split(data, (4,), axis=1)
    # Keep only the first two features so the decision boundary can be
    # drawn in 2-D.
    x = x[:, :2]
    print(x)
    print(y)

    # Standardize then fit logistic regression, expressed as a pipeline
    # (equivalent to calling StandardScaler and LogisticRegression by hand).
    lr = Pipeline([('sc', StandardScaler()),
                   ('clf', LogisticRegression())])
    lr.fit(x, y.ravel())  # ravel: (n, 1) column -> (n,) vector

    # --- decision-region plot ---
    N, M = 500, 500  # number of grid samples along each axis
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()  # range of feature 0
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()  # range of feature 1
    t1 = np.linspace(x1_min, x1_max, N)
    t2 = np.linspace(x2_min, x2_max, M)
    x1, x2 = np.meshgrid(t1, t2)                    # dense sampling grid
    x_test = np.stack((x1.flat, x2.flat), axis=1)   # grid points as samples

    cm_light = mpl.colors.ListedColormap(['#77E0A0', '#FF8080', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
    y_hat = lr.predict(x_test)          # class prediction per grid point
    y_hat = y_hat.reshape(x1.shape)     # back to grid shape for pcolormesh
    plt.pcolormesh(x1, x2, y_hat, cmap=cm_light)  # predicted regions
    plt.scatter(x[:, 0], x[:, 1], c=y, edgecolors='k', s=50, cmap=cm_dark)  # samples
    plt.xlabel('petal length')
    plt.ylabel('petal width')
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.grid()
    plt.savefig('2.png')
    plt.show()

    # Accuracy on the training set.
    y_hat = lr.predict(x)
    y = y.reshape(-1)
    result = y_hat == y
    print(y_hat)
    print(result)
    acc = np.mean(result)
    print('准确度: %.2f%%' % (100 * acc))
overfit.py
#!/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
import matplotlib as mpl
if __name__ == "__main__":
    # Noisy samples of a quadratic, sorted by x; column vectors for sklearn.
    np.random.seed(0)
    N = 9
    x = np.linspace(0, 6, N) + np.random.randn(N)
    x = np.sort(x)
    y = x**2 - 4*x - 3 + np.random.randn(N)
    x.shape = -1, 1
    y.shape = -1, 1

    # Two pipelines sharing the polynomial expansion: plain OLS vs
    # RidgeCV (alpha chosen by cross-validation). Degree is set per
    # iteration via set_params below.
    model_1 = Pipeline([
        ('poly', PolynomialFeatures()),
        ('linear', LinearRegression(fit_intercept=False))])
    model_2 = Pipeline([
        ('poly', PolynomialFeatures()),
        ('linear', RidgeCV(alphas=np.logspace(-3, 2, 100), fit_intercept=False))])
    models = model_1, model_2

    mpl.rcParams['font.sans-serif'] = [u'simHei']   # CJK-capable font for titles
    mpl.rcParams['axes.unicode_minus'] = False
    np.set_printoptions(suppress=True)

    plt.figure(figsize=(9, 11), facecolor='w')
    d_pool = np.arange(1, N, 1)  # polynomial degrees to try
    m = d_pool.size
    # Line colors interpolated from red (0xFF0000 = 16711680) to blue
    # (0x0000FF = 255). Cast to int: '%x' rejects the floats np.linspace
    # yields (TypeError under Python 3).
    clrs = ['#%06x' % int(c) for c in np.linspace(16711680, 255, m)]
    line_width = np.linspace(5, 2, m)
    titles = u'线性回归', u'Ridge回归'
    for t in range(2):
        model = models[t]
        plt.subplot(2, 1, t+1)
        plt.plot(x, y, 'ro', ms=10, zorder=N)
        for i, d in enumerate(d_pool):
            model.set_params(poly__degree=d)
            model.fit(x, y)
            lin = model.get_params('linear')['linear']
            if t == 0:
                print(u'%d阶,系数为:' % d, lin.coef_.ravel())
            else:
                print(u'%d阶,alpha=%.6f,系数为:' % (d, lin.alpha_), lin.coef_.ravel())
            x_hat = np.linspace(x.min(), x.max(), num=100)
            x_hat.shape = -1, 1
            y_hat = model.predict(x_hat)
            s = model.score(x, y)
            print(s, '\n')
            # Draw the true-degree (d == 2) fit above the other curves.
            zorder = N - 1 if (d == 2) else 0
            plt.plot(x_hat, y_hat, color=clrs[i], lw=line_width[i],
                     label=(u'%d阶,score=%.3f' % (d, s)), zorder=zorder)
        plt.legend(loc='upper left')
        plt.grid(True)
        plt.title(titles[t], fontsize=16)
        plt.xlabel('X', fontsize=14)
        plt.ylabel('Y', fontsize=14)
    # pad must be passed by keyword in modern matplotlib
    # (tight_layout(1, ...) positionally is rejected).
    plt.tight_layout(pad=1, rect=(0, 0, 1, 0.95))
    plt.suptitle(u'多项式曲线拟合', fontsize=18)
    plt.show()