1.高斯朴素贝叶斯对鸢尾花数据进行分类
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
if __name__ == "__main__":
    # Load the data set: columns 0-3 are used as features, column 4 as the label.
    data = pd.read_csv('iris.data', header=None)
    x, y = data[np.arange(4)], data[4]
    # Encode the label column as integer category codes.
    y = pd.Categorical(values=y).codes
    print(y)
    feature_names = u'花萼长度', u'花萼宽度', u'花瓣长度', u'花瓣宽度'
    # Only two feature columns are kept so the decision regions can be drawn in 2-D.
    features = [0, 1]
    x = x[features]
    x, x_test, y, y_test = train_test_split(x, y, train_size=0.7, random_state=0)

    # Class priors in the ratio 1:2:4, normalised so that they sum to 1.
    priors = np.array((1, 2, 4), dtype=float)
    priors /= priors.sum()

    # Standardise -> polynomial expansion (degree 1) -> Gaussian naive Bayes.
    gnb = Pipeline([
        ('sc', StandardScaler()),
        ('poly', PolynomialFeatures(degree=1)),
        ('clf', GaussianNB(priors=priors))])
    gnb.fit(x, y.ravel())

    y_hat = gnb.predict(x)
    print('训练集准确度: %.2f%%' % (100 * accuracy_score(y, y_hat)))
    y_test_hat = gnb.predict(x_test)
    print('测试集准确度:%.2f%%' % (100 * accuracy_score(y_test, y_test_hat)))

    # Build an N x M grid spanning the range of the two training features.
    N, M = 500, 500
    x1_min, x2_min = x.min()
    x1_max, x2_max = x.max()
    t1 = np.linspace(x1_min, x1_max, N)
    t2 = np.linspace(x2_min, x2_max, M)
    x1, x2 = np.meshgrid(t1, t2)
    # One row per grid point: (feature 1, feature 2).
    x_grid = np.stack((x1.ravel(), x2.ravel()), axis=1)

    # Font settings so that the Chinese labels and the minus sign render.
    mpl.rcParams['font.sans-serif'] = [u'simHei']
    mpl.rcParams['axes.unicode_minus'] = False
    cm_light = mpl.colors.ListedColormap(['#77E0A0', '#FF8080', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])

    # Predict a class for every grid point and reshape back to the grid layout.
    y_grid_hat = gnb.predict(x_grid)
    y_grid_hat = y_grid_hat.reshape(x1.shape)

    plt.figure(facecolor='w')
    # Background: predicted class per grid point.
    plt.pcolormesh(x1, x2, y_grid_hat, cmap=cm_light)
    # Circles: training samples; triangles: test samples.
    plt.scatter(x[features[0]], x[features[1]], c=y, edgecolors='k', s=50, cmap=cm_dark)
    plt.scatter(x_test[features[0]], x_test[features[1]], c=y_test, marker='^', edgecolors='k', s=120, cmap=cm_dark)
    plt.xlabel(feature_names[features[0]], fontsize=13)
    plt.ylabel(feature_names[features[1]], fontsize=13)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.title(u'GaussianNB对鸢尾花数据的分类结果', fontsize=18)
    plt.grid(True)
    plt.show()
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2]
训练集准确度: 82.86%
测试集准确度:71.11%
2.文本数据的处理流程-20个类别的新闻组数据
import numpy as np
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from time import time
from pprint import pprint
import matplotlib.pyplot as plt
import matplotlib as mpl
def test_clf(clf):
    """Grid-search one classifier with 5-fold CV, then time and score it.

    Reads the module-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``
    that the ``__main__`` section assigns before calling this function.

    Args:
        clf: Estimator instance to tune. The hyper-parameter grid is chosen
            from the attributes it exposes (``alpha``, ``n_neighbors``,
            ``C``, ``max_depth``); when several are present the last
            matching check below wins.

    Returns:
        A tuple ``(t_train, t_test, err, name)``: the grid-search time
        divided by ``5 * number_of_candidates``, the prediction time on the
        test set, the test error rate ``1 - accuracy``, and a short display
        name for the classifier.
    """
    print(u'分类器:', clf)

    # Default grid; it is also the grid used when the estimator has `alpha`.
    alpha_can = np.logspace(-3, 2, 10)
    param_grid = {'alpha': alpha_can}
    m = alpha_can.size  # number of candidate settings in the grid

    # hasattr() checks whether the estimator exposes the given attribute.
    # The checks are independent (not elif), so a later match overrides an
    # earlier one.
    if hasattr(clf, 'alpha'):
        param_grid = {'alpha': alpha_can}
        m = alpha_can.size
    if hasattr(clf, 'n_neighbors'):
        neighbors_can = np.arange(1, 15)
        param_grid = {'n_neighbors': neighbors_can}
        m = neighbors_can.size
    if hasattr(clf, 'C'):
        C_can = np.logspace(1, 3, 3)
        gamma_can = np.logspace(-3, 0, 3)
        param_grid = {'C': C_can, 'gamma': gamma_can}
        m = C_can.size * gamma_can.size
    if hasattr(clf, 'max_depth'):
        max_depth_can = np.arange(4, 10)
        param_grid = {'max_depth': max_depth_can}
        m = max_depth_can.size

    model = GridSearchCV(clf, param_grid=param_grid, cv=5)

    t_start = time()
    model.fit(x_train, y_train)
    t_end = time()
    # Total search time divided by (folds * candidates).
    t_train = (t_end - t_start) / (5*m)
    print(u'5折交叉验证的训练时间为:%.3f秒/(5*%d)=%.3f秒' % ((t_end - t_start), m, t_train))
    print(u'最优超参数为:', model.best_params_)

    t_start = time()
    y_hat = model.predict(x_test)
    t_end = time()
    t_test = t_end - t_start
    print(u'测试时间:%.3f秒' % t_test)

    acc = metrics.accuracy_score(y_test, y_hat)
    print(u'测试集准确率:%.2f%%' % (100 * acc))

    # Display name: text before the first '(' of the estimator's string form,
    # cut at 'Classifier' if present, with 'SVC' shown as 'SVM'.
    name = str(clf).split('(')[0]
    index = name.find('Classifier')
    if index != -1:
        name = name[:index]
    if name == 'SVC':
        name = 'SVM'
    return t_train, t_test, 1-acc, name
if __name__ == "__main__":
    # --- Load the data -----------------------------------------------------
    print(u'开始下载/加载数据...')
    t_start = time()
    remove = ()
    categories = 'alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space'
    data_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=0, remove=remove)
    data_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=0, remove=remove)
    t_end = time()
    print(u'下载/加载数据完成,耗时%.3f秒' % (t_end - t_start))
    print(u'数据类型:', type(data_train))
    print(u'训练集包含的文本数目:', len(data_train.data))
    print(u'测试集包含的文本数目:', len(data_test.data))
    print(u'训练集和测试集使用的%d个类别的名称:' % len(categories))
    # From here on `categories` holds the names as reported by the data set.
    categories = data_train.target_names
    pprint(categories)
    y_train = data_train.target
    y_test = data_test.target

    # --- Show the first ten training documents -------------------------------
    print(u' -- 前10个文本 -- ')
    for i in range(10):
        print(u'文本%d(属于类别 - %s):' % (i+1, categories[y_train[i]]))
        print(data_train.data[i])
        print('\n\n')

    # --- TF-IDF features: fit on the training set, reuse for the test set ----
    vectorizer = TfidfVectorizer(input='content', stop_words='english', max_df=0.5, sublinear_tf=True)
    x_train = vectorizer.fit_transform(data_train.data)
    x_test = vectorizer.transform(data_test.data)
    print(u'训练集样本个数:%d,特征个数:%d' % x_train.shape)
    print(u'停止词:\n')
    pprint(vectorizer.get_stop_words())
    # get_feature_names() no longer exists in recent scikit-learn releases;
    # prefer get_feature_names_out() when available and fall back otherwise.
    get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    feature_names = np.asarray(get_names())

    # --- Tune and evaluate each classifier -----------------------------------
    print(u'\n\n===================\n分类器的比较:\n')
    clfs = (MultinomialNB(),
            BernoulliNB(),
            KNeighborsClassifier(),
            RidgeClassifier(),
            RandomForestClassifier(n_estimators=200),
            SVC()
            )
    result = []
    for clf in clfs:
        a = test_clf(clf)
        result.append(a)
        print('\n')

    # Transpose the list of (t_train, t_test, err, name) tuples into columns.
    # The builtin float replaces the np.float alias, which NumPy has removed.
    time_train, time_test, err, names = zip(*result)
    time_train = np.asarray(time_train, dtype=float)
    time_test = np.asarray(time_test, dtype=float)
    err = np.asarray(err, dtype=float)

    # --- Bar chart: error rate (left axis) vs. timings (right axis) ----------
    x = np.arange(len(time_train))
    mpl.rcParams['font.sans-serif'] = [u'simHei']
    mpl.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(10, 7), facecolor='w')
    ax = plt.axes()
    b1 = ax.bar(x, err, width=0.25, color='#77E0A0')
    ax_t = ax.twinx()
    b2 = ax_t.bar(x+0.25, time_train, width=0.25, color='#FFA0A0')
    b3 = ax_t.bar(x+0.5, time_test, width=0.25, color='#FF8080')
    plt.xticks(x+0.5, names)
    plt.legend([b1[0], b2[0], b3[0]], (u'错误率', u'训练时间', u'测试时间'), loc='upper left', shadow=True)
    plt.title(u'新闻组文本数据不同分类器间的比较', fontsize=18)
    plt.xlabel(u'分类器名称')
    plt.grid(True)
    # `pad` must be passed by keyword; newer Matplotlib rejects it positionally.
    plt.tight_layout(pad=2)
    plt.show()
分类器的比较:
分类器: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
5折交叉验证的训练时间为:0.540秒/(5*10)=0.011秒
最优超参数为: {'alpha': 0.003593813663804626}
测试时间:0.004秒
测试集准确率:89.58%
分类器: BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
5折交叉验证的训练时间为:0.862秒/(5*10)=0.017秒
最优超参数为: {'alpha': 0.001}
测试时间:0.009秒
测试集准确率:88.54%
分类器: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=5, p=2,
weights='uniform')
5折交叉验证的训练时间为:4.213秒/(5*14)=0.060秒
最优超参数为: {'n_neighbors': 3}
测试时间:0.193秒
测试集准确率:86.03%
分类器: RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
max_iter=None, normalize=False, random_state=None,
solver='auto', tol=0.001)
5折交叉验证的训练时间为:7.214秒/(5*10)=0.144秒
最优超参数为: {'alpha': 0.001}
测试时间:0.002秒
测试集准确率:89.28%
分类器: RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=200,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False)
5折交叉验证的训练时间为:27.850秒/(5*6)=0.928秒
最优超参数为: {'max_depth': 9}
测试时间:0.147秒
测试集准确率:77.16%
分类器: SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
5折交叉验证的训练时间为:170.046秒/(5*9)=3.779秒
最优超参数为: {'C': 100.0, 'gamma': 0.03162277660168379}
测试时间:1.795秒
测试集准确率:90.10%
3.