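# 机器学习基础实战练习 (Machine learning fundamentals: hands-on exercises)
# 代码中用到的 csv 文件均在项目根目录下 (all CSV files used by the code are in the project root)
# 第6章 朴素贝叶斯 (Chapter 6: Naive Bayes)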
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Data preparation: load the mushroom table and replace every column
# (including the 'class' target) by integer codes.
from sklearn.preprocessing import LabelEncoder

mushroom = pd.read_csv("mushroom.csv")

# Notebook-style inspection: these are bare expressions, so their results are
# only displayed in an interactive session and are discarded in a script run.
mushroom.head()
mushroom.isnull().sum()
mushroom['class'].unique()

# A single encoder object is re-fitted on each column in turn, so after the
# loop it only holds the fit for the last column it processed.
labelencoder = LabelEncoder()
for column_name, column_values in mushroom.items():
    mushroom[column_name] = labelencoder.fit_transform(column_values)

mushroom.head()

# Number of rows for each encoded value of the 'class' column.
class_sizes = mushroom.groupby('class').size()
print(class_sizes)
from sklearn.naive_bayes import MultinomialNB, GaussianNB
# Column 0 is used as the target; columns 1..22 are used as the features.
y = mushroom.iloc[:, 0]
X = mushroom.iloc[:, 1:23]

# Fixed random_state keeps the train/test partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# fit() returns the estimator itself, so `mnb` is the fitted model.
mnb = MultinomialNB().fit(X_train, y_train)

# Print the score on the training split first, then on the held-out split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(mnb.score(features, labels))
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Draw the learning curve of ``estimator`` on a new pyplot figure.

    ``learning_curve`` is run on ``X``/``y``; the training and test scores it
    returns are averaged along axis 1 and plotted against the training-set
    sizes it reports (training score in red, cross-validation score in
    green).

    Args:
        estimator: Model forwarded to ``learning_curve``.
        title: Figure title, passed to ``plt.title``.
        X: Feature data forwarded to ``learning_curve``.
        y: Target data forwarded to ``learning_curve``.
        ylim: Optional sequence unpacked into ``plt.ylim``; skipped when None.
        cv: Cross-validation setting forwarded to ``learning_curve``.
        n_jobs: Forwarded to ``learning_curve``.
        train_sizes: Training-set sizes forwarded to ``learning_curve``.

    Returns:
        The ``plt`` (pyplot) module, so the caller can show or save the figure.
    """
    # Figure decoration is done before the (potentially slow) scoring run.
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes
    )

    plt.grid()
    curves = (
        (fit_scores, "r", "Training score"),
        (cv_scores, "g", "Cross-validation score"),
    )
    for scores, colour, legend_label in curves:
        # One averaged value per training-set size.
        plt.plot(sizes, np.mean(scores, axis=1), 'o-', color=colour, label=legend_label)
    plt.legend(loc="lower right")
    return plt
# 30 random 70/30 shuffles, seeded so the curves are reproducible.
cv = ShuffleSplit(test_size=0.3, n_splits=30, random_state=28)

# Draw one learning-curve figure per naive Bayes variant.
estimators = [MultinomialNB(), GaussianNB()]
for estimator in estimators:
    # The estimator object itself is handed over as the figure title.
    title = estimator
    plot_learning_curve(estimator, title, X, y, cv=cv, ylim=(0.5, 1.0), n_jobs=1)

# NOTE(review): the source lost its indentation; show() is assumed to run once
# after both figures have been built — confirm against the original layout.
plt.show()
# 第7章 K最邻近算法 (Chapter 7: k-nearest neighbours)
import numpy as np
import pandas as pd
# Data preparation: load the admissions table, separate the features from the
# target and hold out a test split.
from sklearn.model_selection import train_test_split

data = pd.read_csv("Admission_Predict.csv")

# Notebook-style inspection: info() prints its report itself, while the
# head()/describe() results are only displayed in an interactive session.
data.head()
data.info()
data.describe()

# drop() returns a new frame, so `data` stays untouched and the CSV does not
# have to be read a second time just to get a copy without 'Serial No.'.
df = data.drop(['Serial No.'], axis=1)

# The target column is addressed with its trailing space, exactly as written.
x = df.drop(['Chance of Admit '], axis=1)
y = df['Chance of Admit '].values

# Fixed random_state keeps the train/test partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0)

print('X_train shape:{}'.format(X_train.shape))
print('X_test shape:{}'.format(X_test.shape))
print('y_train shape:{}'.format(y_train.shape))
# Fourth line reports the test targets (it used to repeat y_train).
print('y_test shape:{}'.format(y_test.shape))
from sklearn.neighbors import KNeighborsRegressor
# Four k-NN regressors are fitted on the same training split; for each one the
# score on the held-out split is printed (and, except for the baseline, the
# score on the training split as well). fit() returns the estimator itself.

# Baseline: library default settings.
reg = KNeighborsRegressor().fit(X_train, y_train)
default_test_score = reg.score(X_test, y_test)
print('验证集得分:{:.2f}'.format(default_test_score))

# Two neighbours.
reg2 = KNeighborsRegressor(n_neighbors=2).fit(X_train, y_train)
k2_test_score = reg2.score(X_test, y_test)
k2_train_score = reg2.score(X_train, y_train)
print('模型参数n_neighbors=2的验证集得分:{:.2f}'.format(k2_test_score))
print('模型参数n_neighbors=2的训练集得分:{:.2f}'.format(k2_train_score))

# Ten neighbours.
reg10 = KNeighborsRegressor(n_neighbors=10).fit(X_train, y_train)
k10_test_score = reg10.score(X_test, y_test)
k10_train_score = reg10.score(X_train, y_train)
print('模型参数n_neighbors=10的验证集得分:{:.2f}'.format(k10_test_score))
print('模型参数n_neighbors=10的训练集得分:{:.2f}'.format(k10_train_score))

# Default neighbour count, but neighbours weighted by distance.
reg_w = KNeighborsRegressor(weights='distance').fit(X_train, y_train)
weighted_test_score = reg_w.score(X_test, y_test)
weighted_train_score = reg_w.score(X_train, y_train)
print('模型参数weights=distance的验证集得分:{:.2f}'.format(weighted_test_score))
print('模型参数weights=distance的训练集得分:{:.2f}'.format(weighted_train_score))
from sklearn.preprocessing import MinMaxScaler
# k-NN on min-max scaled features.
#
# The split is made on the raw features first (same arguments and seed as the
# unscaled experiment, so both models see the same rows) and the scaler is
# fitted on the training rows only. Fitting it on the whole table would let
# the test rows influence the per-feature min/max used for training, i.e.
# leak test information into the model.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(x, y, random_state=0)

scaler = MinMaxScaler().fit(X_train_raw)
X_train_pp = scaler.transform(X_train_raw)
X_test_pp = scaler.transform(X_test_raw)

# Whole feature table expressed in the training-set scale, kept for display.
# Rows outside the training range can fall slightly outside [0, 1].
x_2 = scaler.transform(x)
print(x_2)

reg_scaled = KNeighborsRegressor()
reg_scaled.fit(X_train_pp, y_train)
print('模型预处理后的模型验证集得分:{:.2f}'.format(reg_scaled.score(X_test_pp, y_test)))
print('模型预处理后的模型训练集得分:{:.2f}'.format(reg_scaled.score(X_train_pp, y_train)))
# One applicant as a single-row feature matrix, presumably in the column
# order of `x` — verify against the CSV header.
# NOTE(review): the last entry is 11; if the final feature column is a 0/1
# flag this is probably a typo for 1 — confirm against the dataset.
X_new = np.array([[337, 118, 4, 4.5, 4.5, 9.65, 11]])

# Predict with the default model first, then with the 10-neighbour model;
# `prediction` ends up holding the latter's result.
for knn_model in (reg, reg10):
    prediction = knn_model.predict(X_new)
    print("K最邻近算法模型预测结果为:")
    print("预测小P同学的综合评分为:{}".format(prediction))
# 第8章 神经网络 (Chapter 8: Neural networks)
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
# Load the Fashion-MNIST training CSV and train a two-hidden-layer MLP on a
# 6000/1200 subsample of it.
data = pd.read_csv("fashion-mnist_train.csv", sep=',')
data.keys()  # notebook-style inspection; the result is discarded in a script

n_rows, n_cols = data.shape[0], data.shape[1]
print('样本数量:{},样本特征数:{}'.format(n_rows, n_cols))

# NOTE(review): the label column is addressed by the name '9' — presumably
# what the CSV's first line provides as header; verify against the file.
y = data['9']
X = data.drop(columns=['9'])

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=6000, test_size=1200, random_state=42)

# Divide the pixel columns by 255 before training.
X_train, X_test = X_train / 255., X_test / 255.

mlp_2layers = MLPClassifier(solver='lbfgs', hidden_layer_sizes=[100, 100], activation='relu', alpha=1e-6, random_state=42)
mlp_2layers.fit(X_train, y_train)

held_out_score = mlp_2layers.score(X_test, y_test)
print('验证数据集得分:{:.2f}%'.format(held_out_score * 100))
# Label table: position i holds the display string "i:<name>" for label i.
_CLASS_NAMES = (
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "AnkleBoot",
)
class_t = ["{}:{}".format(index, name) for index, name in enumerate(_CLASS_NAMES)]


def get_label_class(label):
    """Return the display string stored at index ``label`` of ``class_t``.

    Plain list indexing is used, so an index outside the table raises
    ``IndexError`` and negative indices count from the end.
    """
    return class_t[label]


# Notebook-style demo call; the result is discarded when run as a script.
get_label_class(4)
import numpy as np
from PIL import Image
# Classify a picture from disk with the trained MLP.

# Open the file in a context manager so its handle is closed as soon as the
# float ('F' mode) copy has been made; convert() returns an independent image.
with Image.open("picture.jpg") as source_image:
    image = source_image.convert('F')
image = image.resize((28, 28))

# The image as a 28x28 array (rows = y, columns = x), divided by 255 to match
# the scaling applied to the training data. Flattening row by row puts pixel
# (x, y) at index y * 28 + x, i.e. the order of a y-outer / x-inner pixel
# loop. float64 keeps the values identical to dividing Python floats.
pixels = np.asarray(image, dtype=np.float64) / 255.
arr = pixels.ravel()  # flat vector of 784 values (an ndarray, not a list)
arr1 = arr.reshape(1, -1)  # single-sample matrix expected by predict()

ret_class = mlp_2layers.predict(arr1)[0]
print('对图像识别出的分类是:[{}]'.format(get_label_class(ret_class)))