文章目录
机器学习——基础算法(二)
一、回归实践
(一)局部加权回归
(二)Logistic回归分类:一般使用Logistic回归和Softmax回归进行分类。
#!/usr/bin/python
# -*- coding:utf-8 -*-
"""Logistic regression on the iris dataset.

Reads ``iris.data``, keeps only the first two features (sepal length and
sepal width) so the result can be visualised in 2-D, fits a
standardize -> degree-2 polynomial -> LogisticRegression pipeline, prints
the training accuracy, and plots the decision regions together with the
samples.
"""
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.patches as mpatches


if __name__ == "__main__":
    path = 'iris.data'  # path to the data file

    # Load the data; column 4 holds the species name — encode it as
    # integer category codes (0, 1, 2).
    data = pd.read_csv(path, header=None)
    data[4] = pd.Categorical(data[4]).codes
    # First 4 columns are features, last column is the label.
    x, y = np.split(data.values, (4,), axis=1)
    # Use only the first two features so decision regions are drawable in 2-D.
    x = x[:, :2]

    # Standardize -> polynomial feature expansion -> logistic regression.
    lr = Pipeline([('sc', StandardScaler()),
                   ('poly', PolynomialFeatures(degree=2)),
                   ('clf', LogisticRegression())])
    lr.fit(x, y.ravel())

    y_hat = lr.predict(x)            # class predictions on the training set
    y_hat_prob = lr.predict_proba(x)  # per-class probabilities
    np.set_printoptions(suppress=True)
    print('y_hat = \n', y_hat)
    print('y_hat_prob = \n', y_hat_prob)
    print(u'准确度:%.2f%%' % (100 * np.mean(y_hat == y.ravel())))

    # ---------- plotting ----------
    N, M = 500, 500  # number of grid samples along each axis
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()  # range of feature 0
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()  # range of feature 1
    t1 = np.linspace(x1_min, x1_max, N)
    t2 = np.linspace(x2_min, x2_max, M)
    x1, x2 = np.meshgrid(t1, t2)                   # grid sample points
    x_test = np.stack((x1.flat, x2.flat), axis=1)  # grid points as samples

    mpl.rcParams['font.sans-serif'] = [u'simHei']  # Chinese-capable font
    mpl.rcParams['axes.unicode_minus'] = False
    cm_light = mpl.colors.ListedColormap(['#77E0A0', '#FF8080', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])

    # Predicted class over the grid, reshaped to match the mesh.
    y_grid_hat = lr.predict(x_test).reshape(x1.shape)

    plt.figure(facecolor='w')
    plt.pcolormesh(x1, x2, y_grid_hat, cmap=cm_light)  # decision regions
    plt.scatter(x[:, 0], x[:, 1], c=np.squeeze(y), edgecolors='k',
                s=50, cmap=cm_dark)                    # training samples
    plt.xlabel(u'花萼长度', fontsize=14)
    plt.ylabel(u'花萼宽度', fontsize=14)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.grid()
    patchs = [mpatches.Patch(color='#77E0A0', label='Iris-setosa'),
              mpatches.Patch(color='#FF8080', label='Iris-versicolor'),
              mpatches.Patch(color='#A0A0FF', label='Iris-virginica')]
    plt.legend(handles=patchs, fancybox=True, framealpha=0.8)
    plt.title(u'鸢尾花Logistic回归分类效果 - 标准化', fontsize=17)
    plt.show()
(三)AUC(分类器评价指标)
# -*-coding:utf-8-*-
"""ROC curves and AUC for a 3-class logistic-regression classifier.

Generates random 50-dimensional data with three (indistinguishable)
classes, fits a logistic regression, and plots per-class, micro-averaged
and macro-averaged ROC curves with their AUC values.
"""
import numbers
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.preprocessing import label_binarize
from sklearn import metrics
from itertools import cycle


if __name__ == '__main__':
    np.random.seed(0)  # reproducible random data
    pd.set_option('display.width', 300)
    np.set_printoptions(suppress=True)

    # Random features: the classes are not actually separable, so the
    # curves illustrate ROC/AUC mechanics rather than a good model.
    n = 300
    x = np.random.randn(n, 50)
    y = np.array([0] * 100 + [1] * 100 + [2] * 100)
    n_class = 3

    clf = LogisticRegression(penalty='l2', C=1)
    clf.fit(x, y)
    y_score = clf.decision_function(x)  # per-class decision scores
    # One-hot encode labels so each column gives a binary ROC problem.
    y = label_binarize(y, classes=np.arange(n_class))

    colors = cycle('gbc')
    fpr = dict()
    tpr = dict()
    # Slots: one AUC per class, then micro, then macro.
    auc = np.empty(n_class + 2)

    mpl.rcParams['font.sans-serif'] = u'SimHei'
    mpl.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(7, 6), facecolor='w')

    # Per-class ROC curves.
    for i, color in zip(np.arange(n_class), colors):
        fpr[i], tpr[i], thresholds = metrics.roc_curve(y[:, i], y_score[:, i])
        auc[i] = metrics.auc(fpr[i], tpr[i])
        plt.plot(fpr[i], tpr[i], c=color, lw=1.5, alpha=0.7,
                 label=u'AUC=%.3f' % auc[i])

    # Micro-average: pool all (label, score) pairs into one binary problem.
    fpr['micro'], tpr['micro'], thresholds = metrics.roc_curve(y.ravel(), y_score.ravel())
    auc[n_class] = metrics.auc(fpr['micro'], tpr['micro'])
    plt.plot(fpr['micro'], tpr['micro'], c='r', lw=2, ls='-', alpha=0.8,
             label=u'micro,AUC=%.3f' % auc[n_class])

    # Macro-average: interpolate each class's TPR onto a common FPR grid
    # and average the curves.
    fpr['macro'] = np.unique(np.concatenate([fpr[i] for i in np.arange(n_class)]))
    tpr_ = np.zeros_like(fpr['macro'])
    for i in np.arange(n_class):
        tpr_ += np.interp(fpr['macro'], fpr[i], tpr[i])
    tpr_ /= n_class
    tpr['macro'] = tpr_
    auc[n_class + 1] = metrics.auc(fpr['macro'], tpr['macro'])
    print(auc)
    print('Macro AUC:', metrics.roc_auc_score(y, y_score, average='macro'))
    plt.plot(fpr['macro'], tpr['macro'], c='m', lw=2, alpha=0.8,
             label=u'macro,AUC=%.3f' % auc[n_class + 1])

    # Chance diagonal for reference.
    plt.plot((0, 1), (0, 1), c='#808080', lw=1.5, ls='--', alpha=0.7)
    plt.xlim((-0.01, 1.02))
    plt.ylim((-0.01, 1.02))
    plt.xticks(np.arange(0, 1.1, 0.1))
    plt.yticks(np.arange(0, 1.1, 0.1))
    plt.xlabel('False Positive Rate', fontsize=13)
    plt.ylabel('True Positive Rate', fontsize=13)
    # NOTE: the 'b' keyword was removed in matplotlib 3.5; pass positionally.
    plt.grid(True)
    plt.legend(loc='lower right', fancybox=True, framealpha=0.8, fontsize=12)
    plt.title(u'ROC和AUC', fontsize=17)
    plt.show()
(四)用线性模型做预测
对序列做一阶差分 y(i) − y(i−1) 后,序列不再有明显的趋势性增长,但方差变大,即震荡加剧。解决办法:先对原序列取对数,再做差分。
模型一:自回归
模型二:滑动平均;抹去周期性的震荡线
AR 是自回归(AutoRegressive),MA 是滑动平均(Moving Average),RMSE 是均方根误差,用来衡量预测值与真实值之间的差距。