import numpy as np
import pandas as pd
# Column labels for the UCI hepatitis data set, in file order. They stay in
# Chinese because the rest of the script indexes the frame by these labels.
column_names = [
    '是否生还', '年龄', '性别', '类固醇', '抗病毒药', '疲惫', '不适', '厌食', '肝大', '肝硬化',
    '脾可触', '蛛状', '腹腔积水', '静脉曲张', '胆红素',
    '碱性磷酸酯', '谷草转氨酶', '血蛋白', '凝血酶原时间', '组织学',
]

# hepatitis.data has no header row. Without header=None pandas would consume
# the first patient record as column labels, and that record would be lost as
# soon as the labels were overwritten.
df = pd.read_csv(r'hepatitis.data', encoding='utf-8', header=None, names=column_names)

# Overall view of the raw data (columns still containing '?' are object dtype
# at this point, so describe() only summarises the fully numeric ones).
print(df.shape)
print(df.describe())

# The data set marks missing values with '?'. Turn them into NaN, then convert
# every column to float so that the numeric fill-in steps work.
df = df.replace('?', np.nan)
df[column_names] = df[column_names].astype('float')

# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None").
df.info()
# ---- Fill in missing values ----

# Categorical (coded) symptoms: impute with the most frequent value.
df_mode = ['类固醇', '疲惫', '不适', '厌食', '肝大', '肝硬化', '脾可触', '蛛状', '腹腔积水', '静脉曲张']

# Show the mode(s) of every categorical column before imputing.
num_zhong = df.loc[:, df_mode].mode()
print(num_zhong)

# The first row of DataFrame.mode() holds, per column, the same value as
# Series.mode()[0]; fillna() with a Series aligns on column labels, so a
# single pass replaces the per-column loop (which was previously run twice).
df[df_mode] = df[df_mode].fillna(num_zhong.iloc[0])

# Continuous lab measurements: impute with the column mean.
df_mean = ['胆红素', '碱性磷酸酯', '谷草转氨酶', '血蛋白', '凝血酶原时间']
df[df_mean] = df[df_mean].fillna(df[df_mean].mean())

# Sanity check: remaining number of missing values per column.
print(df.isnull().sum())
# ---- Exploratory analysis ----
from matplotlib import pyplot as plt
import seaborn as sns

sns.set()
# NOTE(review): the pie labels are Chinese; matplotlib's default font may not
# contain CJK glyphs. Confirm that a suitable font is configured on the
# machine that renders this figure.

# 1. Share of recovered vs. fatal cases.
#    In the '是否生还' column, 1 marks a fatal case and 2 a recovered case.
outcome = df['是否生还'].replace([1, 2], ['死亡', '治愈'])
df_counts = outcome.value_counts()
print(df_counts)
print(outcome.value_counts(normalize=True))

# Derive labels, colours and offsets from the counts themselves instead of
# hard-coding them: value_counts() orders by frequency, so a fixed
# ['治愈', '死亡'] list would mislabel the wedges whenever deaths outnumber
# recoveries.
labels = list(df_counts.index)
color = ['r' if name == '死亡' else 'g' for name in labels]
explode = tuple(0.2 if name == '死亡' else 0 for name in labels)  # pull the fatal wedge out
plt.pie(df_counts, colors=color, explode=explode, labels=labels,
        shadow=True, autopct='%1.1f%%')
# Equal aspect ratio so the pie is drawn as a circle.
plt.axis('equal')
plt.legend()

# Age statistics over all patients.
df_age = df['年龄'].agg(['min', 'max', 'mean'])
print(df_age)

# Age statistics over fatal cases only.
df_age_dead = df[df['是否生还'] == 1]['年龄'].agg(['min', 'max', 'mean'])
print(df_age_dead)

# Without an explicit show() the figure is never displayed when the file is
# run as a plain script.
plt.show()
# ---- Decision tree ----
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Features are every column except the outcome; the outcome is the target.
df_x = df.drop(labels='是否生还', axis=1)
df_y = df.loc[:, '是否生还']
print(df_y.head())
print(df_x.head())

# test_size=0.3 keeps 7/10 of the rows for training and 3/10 for testing.
# random_state makes the split (and every score below) reproducible;
# stratify keeps the outcome ratio the same in both parts.
X_train, X_test, Y_train, Y_test = train_test_split(
    df_x, df_y, test_size=0.3, random_state=17, stratify=df_y)

# Baseline: unrestricted entropy tree, accuracy on the held-out test set.
clf = tree.DecisionTreeClassifier(criterion="entropy", random_state=17)
clf = clf.fit(X_train, Y_train)
score = clf.score(X_test, Y_test)
print('决策树的得分情况', score)

# Tune max_depth with a 5-fold grid search. The candidate trees use the same
# criterion as the final model so the selected depth actually applies to it.
tree_params = {'max_depth': range(5, 12)}
locally_best_tree = GridSearchCV(
    DecisionTreeClassifier(criterion="entropy", random_state=17),
    tree_params, cv=5)
locally_best_tree.fit(X_train, Y_train)
print("最优深度:{}".format(locally_best_tree.best_params_))

# Refit with the depth the search selected (instead of a hard-coded 6) and
# score it on the test set.
best_depth = locally_best_tree.best_params_['max_depth']
clf = tree.DecisionTreeClassifier(criterion="entropy", max_depth=best_depth, random_state=17)
clf = clf.fit(X_train, Y_train)
score = clf.score(X_test, Y_test)
print("最优深度下的得分情况:{}".format(score))
# Disabled code: the bare string literal below is never executed. It holds a
# snippet that refits an entropy tree with max_depth=6 and renders it through
# the graphviz package (tree.export_graphviz -> graphviz.Source). To use it,
# move the code out of the string; it needs X_train / Y_train from the
# decision-tree section and an installed graphviz.
'''
#根据结果,我们可以画出一个决策树
feature_name=[ '年龄', '性别', '类固醇', '抗病毒药','疲惫','不适','厌食','肝大','肝硬化','脾可触','蛛状','腹腔积水','静脉曲张','胆红素'
,'碱性磷酸酯','谷草转氨酶','血蛋白','凝血酶原时间','组织学']
import graphviz
clf = tree.DecisionTreeClassifier(criterion="entropy",max_depth=6, random_state=17)
clf = clf.fit(X_train, Y_train)
dot_data = tree.export_graphviz(clf
,out_file = None
,feature_names= feature_name
,class_names=["病逝","治愈"]
,filled=True
,rounded=True
)
graph = graphviz.Source(dot_data)
print(graph)
'''
# ---- Random forest ----
from sklearn.ensemble import RandomForestClassifier

# 100 trees with a fixed seed, trained on the same split as the decision
# tree; fit() returns the estimator itself, so construction and training
# can be chained.
rf = RandomForestClassifier(random_state=17, n_estimators=100).fit(X_train, Y_train)

# Accuracy of the fitted forest on the held-out test set.
score = rf.score(X_test, Y_test)
print("随机森林的得分情况:{}".format(score))
# ---- Gaussian naive Bayes ----
from sklearn.naive_bayes import GaussianNB

# Default-parameter Gaussian naive Bayes, trained on the training split
# (fit() returns the estimator, so it is chained onto the constructor).
nb_model = GaussianNB().fit(X_train, Y_train)

# Accuracy on the held-out test set.
score = nb_model.score(X_test, Y_test)
print('贝叶斯下得分情况', score)
# Reference notes only: the bare string literal below is never executed. It
# quotes a KNN classifier example and parameter notes from a CSDN article
# (attribution kept inside the text). The snippet uses feature_train /
# target_train / feature_test / target_test and accuracy_score, none of which
# are defined in the visible part of this script, so it cannot be enabled
# as-is.
'''
然后是KNN分类器:
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors=1)
knn_model.fit(feature_train,target_train) #使用训练集训练模型
predict_results_knn = knn_model.predict(feature_test) #使用模型对测试集进行预测
#查看预测结果
print("predict_results:",predict_results_knn)
print("target_test:",target_test)
print(accuracy_score(predict_results_knn,target_test))
其中,KNeighborClassifier的具体参数包括:
(1) n_neighbors :KNN中的k值,默认值是5
(2) weights :近邻权,标识每个样本的K个近邻样本的权重,可选’uniform’/‘distance’或自定义权重。
(3) metric :距离度量方法
————————————————
版权声明:本文为CSDN博主「蒙氏宝宝」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
原文链接:https://blog.csdn.net/weixin_33423069/article/details/114054335'''
# 数据集地址 (dataset URL):
# github地址:git
# https://archive.ics.uci.edu/ml/datasets/Hepatitis