0. 机器学习简单流程
文字版机器学习流程
-
!!!明确原始数据做什么
-
数据的基本处理:用 pandas(pd)处理数据(缺失值处理、合并表等)
-
特征工程:对特征进行处理
-
找到合适的算法去预测
模型:算法+数据 -
模型的评估,判定效果
-如果不成功
1—>换算法,调参数
2—>特征工程 -
上线使用
1.KNN算法
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
def knn_iris():
    """Classify the iris dataset with a k-nearest-neighbours model (k=1).

    Prints the shapes of the train/test feature matrices, the predicted
    labels, an element-wise comparison against the true labels, and the
    accuracy on the held-out half of the data. Returns None.
    """
    # 1. Load the dataset.
    dataset = load_iris()

    # 2. Split the data: test_size=0.5 holds out half of the samples for
    #    testing; random_state=2 pins the shuffle.
    train_x, test_x, train_y, test_y = train_test_split(
        dataset.data,
        dataset.target,
        test_size=0.5,
        random_state=2,
    )
    print(train_x.shape)
    print(test_x.shape)

    # 3. Feature engineering: standardise. The scaler is fitted on the
    #    training features only and then reused for the test features.
    scaler = StandardScaler()
    train_x = scaler.fit_transform(train_x)
    test_x = scaler.transform(test_x)

    # 4. Build and fit the KNN estimator.
    classifier = KNeighborsClassifier(n_neighbors=1)
    classifier.fit(train_x, train_y)

    # 5. Evaluate the model.
    # Approach one: compare predictions with the true labels directly.
    predicted = classifier.predict(test_x)
    print("y_predict:\n", predicted)
    print("直接对比真实值和预测值:\n", test_y == predicted)

    # Approach two: let the estimator compute accuracy from features + labels.
    accuracy = classifier.score(test_x, test_y)
    print("准确度为:\n", accuracy)


if __name__ == '__main__':
    knn_iris()
(75, 4)
(75, 4)
y_predict:
[0 0 2 0 0 2 0 2 2 0 0 0 0 0 1 1 0 1 2 1 1 1 2 1 1 0 0 2 0 2 2 0 1 2 1 0 2
1 1 2 1 1 2 1 0 1 0 1 0 0 0 1 2 2 0 2 2 2 1 0 0 2 1 1 2 2 1 0 1 0 2 1 1 0
1]
直接对比真实值和预测值:
[ True True True True True True True True True True True True
True True True True True True True True True True True True
True True True True True True True True True True True True
True True True True True True True True True False True True
True True True False True True True True True True False True
True True True True True True True True True True True True
True True True]
准确度为:
0.96
2.网格搜索和交叉验证
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV # 交叉验证和网格搜索
def knn_iriscv():
    """Classify the iris dataset with KNN, tuned by grid search + cross-validation.

    Tries n_neighbors from 1 to 7 with 10-fold cross-validation on the
    training half, then prints the test accuracy, the best parameters,
    the best cross-validation score and the best estimator. Returns None.
    """
    # 1. Load the dataset.
    dataset = load_iris()

    # 2. Split the data (half for training, half for testing, fixed seed).
    train_x, test_x, train_y, test_y = train_test_split(
        dataset.data,
        dataset.target,
        test_size=0.5,
        random_state=2,
    )

    # 3. Feature engineering: standardise, fitting on the training set only.
    scaler = StandardScaler()
    train_x = scaler.fit_transform(train_x)
    test_x = scaler.transform(test_x)

    # 4. KNN estimator wrapped in a grid search over the neighbour count.
    neighbour_grid = {"n_neighbors": list(range(1, 8))}
    search = GridSearchCV(
        KNeighborsClassifier(),
        param_grid=neighbour_grid,
        cv=10,  # 10-fold cross-validation
    )
    search.fit(train_x, train_y)

    # 5. Evaluate: scoring the fitted search uses the best parameters found.
    test_accuracy = search.score(test_x, test_y)

    # Label/value pairs printed in order. best_score_ is the best result
    # obtained on the validation folds, not on the test set. The per-fold
    # details are available in search.cv_results_ if needed.
    report = (
        ("准确度为:\n", test_accuracy),
        ("最佳参数:\n", search.best_params_),
        ("最佳结果:\n", search.best_score_),
        ("最佳估计器:\n", search.best_estimator_),
    )
    for label, value in report:
        print(label, value)


if __name__ == '__main__':
    knn_iriscv()
准确度为:
0.96
最佳参数:
{'n_neighbors': 1}
最佳结果:
0.9466666666666667
最佳估计器:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=1, p=2,
weights='uniform')
3. KNN 实战
4.朴素贝叶斯
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import datasets
import numpy as np
def nb_news():
    """Classify 20-newsgroups posts with a multinomial naive Bayes model.

    Fetches the "train" subset, splits it into train/test parts, turns the
    text into TF-IDF features, fits the classifier and prints its accuracy
    on the test part. Returns None.
    """
    # 1) Fetch the data.
    corpus = fetch_20newsgroups(subset="train")
    # For a quick look at the data, try:
    #     print(len(corpus.data))   # number of documents
    #     print(corpus.data[0])     # first document

    # 2) Split the data. No random_state is given, so the split (and the
    #    printed accuracy) may differ from run to run.
    docs_train, docs_test, labels_train, labels_test = train_test_split(
        corpus.data, corpus.target
    )

    # 3) Feature engineering: TF-IDF text feature extraction. The vocabulary
    #    is learned from the training documents only.
    vectorizer = TfidfVectorizer()
    tfidf_train = vectorizer.fit_transform(docs_train)
    tfidf_test = vectorizer.transform(docs_test)

    # 4) Naive Bayes estimator.
    classifier = MultinomialNB()
    classifier.fit(tfidf_train, labels_train)

    # 5) Evaluate the model: accuracy on the test part.
    accuracy = classifier.score(tfidf_test, labels_test)
    print("准确率为", accuracy)


nb_news()
准确率为 0.8416401555319901
5. 决策树
from sklearn.tree import DecisionTreeClassifier,export_graphviz # 可视化决策树
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
def decision_iris():
    """Classify the iris dataset with a decision tree and export the tree.

    Prints the accuracy on the test split and writes a Graphviz description
    of the fitted tree to ./tree.doc. Returns None.
    """
    # 1) Load the dataset.
    dataset = load_iris()

    # 2) Split the data with a fixed seed.
    train_x, test_x, train_y, test_y = train_test_split(
        dataset.data, dataset.target, random_state=22
    )
    # No distance computation is involved here, so standardising the
    # features is optional and is skipped.

    # 3) Decision-tree estimator with default settings; passing
    #    criterion="entropy" would use information gain instead.
    tree_model = DecisionTreeClassifier()
    tree_model.fit(train_x, train_y)

    # 4) Evaluate the model: accuracy on the test split.
    accuracy = tree_model.score(test_x, test_y)
    print("决策树的准确率为:", accuracy)

    # Export the tree for visualisation.
    # NOTE(review): the output file gets Graphviz text despite the ".doc"
    # extension - confirm whether ".dot" was intended.
    export_graphviz(
        tree_model,
        out_file="./tree.doc",
        feature_names=dataset.feature_names,
    )


decision_iris()
决策树的准确率为: 0.9210526315789473