课程链接:https://stepik.org/course/4852/syllabus 语言:俄语
文中使用数据:链接: https://pan.baidu.com/s/1PYqtZWdxr8k-_lTGvhgqKQ 提取码: j6im
一、数据训练
from sklearn import tree
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display
from IPython.display import SVG
from sklearn.model_selection import train_test_split
from graphviz import Source
from sklearn.tree import export_graphviz # 导入的是一个函数
from IPython.display import HTML
# CSS that scales every rendered SVG down to 20% of its width and height.
style = "<style>svg{width:20% !important;height:20% !important;}</style>"
# A bare HTML(...) expression is only rendered when it is the last expression
# of a notebook cell; display() renders it wherever this line ends up.
display(HTML(style))
# Read the training and test data.
# NOTE: index_col must be set; otherwise the index column is read incorrectly
# and the final plot comes out wrong.
data_train_iris = pd.read_csv(
    "E:/Training/Python/Trying/data/train_iris.csv",
    index_col=0,
)
data_train_iris.head(10)
data_test_iris = pd.read_csv(
    "E:/Training/Python/Trying/data/test_iris.csv",
    index_col=0,
)
data_test_iris.head()

# Rename the columns: replace the space in each feature name with an
# underscore. The same mapping is applied to both frames.
iris_column_renames = {
    "sepal length": "sepal_length",
    "sepal width": "sepal_width",
    "petal length": "petal_length",
    "petal width": "petal_width",
}
data_train_iris = data_train_iris.rename(columns=iris_column_renames)
data_train_iris.head(10)
data_test_iris = data_test_iris.rename(columns=iris_column_renames)
data_test_iris.head(10)
# Extract the data used by the model: the "species" column is the target y,
# all remaining columns form the feature matrix X.
y_train = data_train_iris["species"]
X_train = data_train_iris.drop(columns=["species"])
y_test = data_test_iris["species"]
X_test = data_test_iris.drop(columns=["species"])
# Depth sweep: fit one entropy-based decision tree for every max_depth from
# 1 to 100 and record its accuracy on the training and the test set.
#
# np.random.seed() returns None, so assigning its result to `rs` would pass
# random_state=None to every classifier. Seed the global RNG and keep the
# seed itself as an explicit integer instead.
np.random.seed(0)
rs = 0
max_depth_values = range(1, 101)  # inclusive 1..100

# Collect one record per depth and build the frame once at the end;
# DataFrame.append was removed in pandas 2.0.
score_rows = []
for max_depth in max_depth_values:
    clf = tree.DecisionTreeClassifier(
        criterion='entropy',
        max_depth=max_depth,
        random_state=rs,
    )
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    score_rows.append({
        "max_depth": max_depth,
        "train_score": train_score,
        "test_score": test_score,
    })

scores_data = pd.DataFrame(
    score_rows,
    columns=["max_depth", "train_score", "test_score"],
)
# Reshape to long format for plotting: one row per (max_depth, set_type)
# pair, with the score type in "set_type" and its value in "score".
scores_data_long = scores_data.melt(
    id_vars=["max_depth"],
    value_vars=["train_score", "test_score"],
    var_name="set_type",
    value_name="score",
)
# One line per set_type (train_score / test_score) against max_depth.
sns.lineplot(data=scores_data_long, x="max_depth", y="score", hue="set_type")
关于用到的pd.melt函数,可查看【Python专题】pandas.melt函数
二、预测
本程序根据物种未知的动物的各项特征,预测它是猫还是狗,即预测其物种。
from sklearn import tree
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display
from IPython.display import SVG
from sklearn.model_selection import train_test_split
from graphviz import Source
from sklearn.tree import export_graphviz # 导入的是一个函数
from IPython.display import HTML
# CSS that scales every rendered SVG down to 20% of its width and height.
style = "<style>svg{width:20% !important;height:20% !important;}</style>"
# A bare HTML(...) expression is only rendered when it is the last expression
# of a notebook cell; display() renders it wherever this line ends up.
display(HTML(style))
# Read the labelled training data and replace the spaces in the
# "Лазает по деревьям" column name with underscores.
data_ori = pd.read_csv(
    "E:/Training/Python/Trying/data/dogs_n_cats.csv"
).rename(columns={"Лазает по деревьям": "Лазает_по_деревьям"})
data_ori.head(10)
# "Вид" is the target column; every other column is a feature.
y_ori = data_ori["Вид"]
X_ori = data_ori.drop(columns=["Вид"])
# np.random.seed() returns None, so assigning its result to `rs` would pass
# random_state=None to both calls below. Seed the global RNG and keep the
# seed itself as an explicit integer instead.
np.random.seed(0)
rs = 0
# Hold out 33% of the labelled data so the trained model can be checked.
X_train, X_test, y_train, y_test = train_test_split(
    X_ori,
    y_ori,
    test_size=0.33,
    random_state=rs,
)
# max_depth is defined explicitly so this section does not depend on a
# variable leaking in from other code (it raises NameError otherwise).
# NOTE(review): 99 is a deliberately generous cap; tune it for the dataset.
max_depth = 99
clf = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=max_depth,
    random_state=rs,
)
clf.fit(X_train, y_train)
# Predict on the held-out split and compare with the actual labels by
# counting the 'собачка' samples in the predictions and in the ground truth.
result = clf.predict(X_test)
predicted_labels = pd.Series(result)
predicted_labels[predicted_labels == 'собачка'].count()
y_test[y_test == 'собачка'].count()
# Read the unlabelled data whose species must be predicted (JSON content in
# a .txt file) and apply the same column rename as for the training data.
data_pre = pd.read_json("E:/Training/Python/Trying/data/dataset_209691_15.txt")
data_pre = data_pre.rename(columns={"Лазает по деревьям": "Лазает_по_деревьям"})
data_pre.head(10)
# Select the feature columns in exactly the order the classifier was fitted
# with: the JSON column order is not guaranteed to match the CSV, and a
# mismatch either raises in predict() or silently mixes up features,
# depending on the scikit-learn version.
X_pre = data_pre[X_train.columns]
result = clf.predict(X_pre)
# Number of samples predicted as 'собачка'.
pd.Series(result)[result == 'собачка'].count()