数据集:
数据集获取,百度网盘:
复制这段内容后打开百度网盘手机App,操作更方便哦 链接:
https://pan.baidu.com/s/1i9PAcOQ9g15qtMyiCvl3lQ
提取码:6z30
上代码:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import seaborn as sns
import numpy as np
# 前三个函数是为了把.xlsx文件转化为.csv
def get_excel_sheet_list(filename):
    """Return the names of all worksheets in an Excel workbook.

    The workbook is opened only to read its sheet names; the sheets are not
    parsed into DataFrames, and the file handle is closed before returning.

    Args:
        filename: Path to the ``.xlsx`` file (read with the openpyxl engine).

    Returns:
        A list of sheet names in workbook order.
    """
    with pd.ExcelFile(filename, engine='openpyxl') as workbook:
        return list(workbook.sheet_names)
def get_excel_sheet_data(filename, sheetname=None):
    """Read one worksheet and return its rows as a list of dicts.

    Args:
        filename: Path to the ``.xlsx`` file (read with the openpyxl engine).
        sheetname: Name or position of the sheet to read. When omitted, the
            first sheet of the workbook is read.

    Returns:
        A list with one ``{column name: value}`` dict per data row.
    """
    # Passing sheet_name=None to read_excel yields a dict of DataFrames (one
    # per sheet), which has no to_dict(orient=...) method; fall back to the
    # first sheet so the default argument is usable.
    target_sheet = 0 if sheetname is None else sheetname
    frame = pd.read_excel(filename, sheet_name=target_sheet, engine='openpyxl')
    return frame.to_dict(orient='records')
def file_trans(Read_filepath):
    """Export every non-empty worksheet of a workbook to ``<sheet name>.csv``.

    Each CSV is written as UTF-8 without the index column, using the sheet
    name plus ``.csv`` as a relative path, and overwrites any existing file
    of that name.

    Args:
        Read_filepath: Path to the source ``.xlsx`` workbook.

    Returns:
        The paths of the CSV files that were actually written, in sheet
        order. Empty sheets are skipped and do not appear in the list.
    """
    file_list = []
    for sheet_name in get_excel_sheet_list(Read_filepath):
        sheet_data = pd.DataFrame(get_excel_sheet_data(Read_filepath, sheet_name))
        if sheet_data.empty:
            continue
        save_filepath = sheet_name + '.csv'
        sheet_data.to_csv(save_filepath, index=False, mode='w', encoding='utf-8')
        # Record the path only once the file exists, so callers never receive
        # the name of a CSV that was skipped.
        file_list.append(save_filepath)
    return file_list
# 读取数据集,返回数据行列表和列名(此处并未对缺失值做填补)
def getDataSet(filepath="../xiangmu/pythonProject/数据集_25.xlsx"):
    """Load the dataset and return its rows together with the column names.

    The workbook is first exported to CSV files via ``file_trans``; the first
    CSV in the returned list is read back and its columns are renamed to the
    twelve fixed names below. No missing-value filling is performed here.

    Args:
        filepath: Path to the source ``.xlsx`` workbook. Defaults to the
            location originally hard-coded in this script.

    Returns:
        A ``(rows, column_names)`` tuple: ``rows`` is a list of per-row value
        lists, ``column_names`` is the array of the twelve column labels.

    Raises:
        IndexError: If ``file_trans`` returns an empty list.
    """
    csv_files = file_trans(filepath)
    df = pd.read_csv(csv_files[0])
    # Assigning the labels requires the sheet to have exactly twelve columns;
    # pandas rejects a length mismatch.
    df.columns = ['年龄', '阶段', '地区', '文化程度', '家庭教养', '孩子性格',
                  '家庭情感', '孩子应对', '自我效能', '交流情况', '上网目的', '创伤成长']
    return df.values.tolist(), df.columns.values
def splitData(aaaaaaaaaa, bbbbbbbbbb, nxval=5, fold=None):
    """Split rows into train / test sets by taking every ``nxval``-th row.

    A row at position ``i`` goes to the test set when ``i % nxval == fold``
    and to the training set otherwise. The last element of each row is the
    label; the remaining elements are the features.

    Args:
        aaaaaaaaaa: Sequence of rows, each a sequence whose last item is the
            label.
        bbbbbbbbbb: Column names. Accepted for call compatibility; not used
            by the split.
        nxval: Number of folds the row positions are divided into.
        fold: Which fold is held out for testing. Defaults to the last fold
            (``nxval - 1``), the one the original five-pass loop ended on.

    Returns:
        ``(x_train, y_train, x_test, y_test)`` as four lists.

    Raises:
        ValueError: If ``nxval`` is smaller than 1 or ``fold`` is outside
            ``range(nxval)``.
    """
    if nxval < 1:
        raise ValueError("nxval must be at least 1")
    if fold is None:
        fold = nxval - 1
    if not 0 <= fold < nxval:
        raise ValueError("fold must be in range(nxval)")

    x_train, y_train, x_test, y_test = [], [], [], []
    for position, row in enumerate(aaaaaaaaaa):
        if position % nxval == fold:
            x_test.append(row[:-1])
            y_test.append(row[-1])
        else:
            x_train.append(row[:-1])
            y_train.append(row[-1])
    return x_train, y_train, x_test, y_test
# Load the dataset and carve out the train / test portions.
aa, bb = getDataSet()
x_train, y_train, x_test, y_test = splitData(aa, bb)

# Decision-tree model using the entropy split criterion.
clf = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    min_samples_leaf=2,
    min_samples_split=2,
)
# Fit on the training portion only.
clf.fit(x_train, y_train)

# Predict on both portions with the fitted model.
train_predict = clf.predict(x_train)
test_predict = clf.predict(x_test)

from sklearn import metrics

# Accuracy: share of predictions that match the true label.
train_accuracy = metrics.accuracy_score(y_train, train_predict)
print('训练准确率:', train_accuracy)
test_accuracy = metrics.accuracy_score(y_test, test_predict)
print('测试准确率:', test_accuracy)

# Confusion matrix. Predictions are passed first, so rows correspond to the
# predicted class and columns to the true class, matching the axis labels
# used on the heatmap below.
confusion_matrix_result = metrics.confusion_matrix(test_predict, y_test)
print('混淆矩阵:\n', confusion_matrix_result)

# Visualise the matrix as an annotated heatmap.
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix_result, annot=True, cmap='BrBG_r')
plt.xlabel('true')
plt.ylabel('predict')
plt.show()
结果:
准确率达到0.97,已经是很高的数值了