学习资料
kaggle
数据集、源文件等:资料,提取码:zmlc
一、导包
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
# Font loaded from a local TTF file; it is passed as `fontproperties=` to every
# Chinese title/axis label below so those labels use this font.
myfont = matplotlib.font_manager.FontProperties(fname="./DroidSansFallback.ttf")
# Draw negative tick labels with an ASCII hyphen instead of the Unicode minus sign.
plt.rcParams['axes.unicode_minus'] = False
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import scale, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
import warnings
warnings.filterwarnings('ignore')
二、数据探索
# Read the raw CSV once; `data` stays untouched and all work happens on `df`.
data = pd.read_csv(
    "data/healthcare-dataset-stroke-data.csv",
    encoding='UTF-8',
)
df = data.copy()

# Quick preview of the first rows.
df.head()

# The `id` column is removed before any further analysis.
df.drop(columns=['id'], inplace=True)

# Column dtypes / non-null counts, then summary statistics.
df.info()
df.describe()
三、数据可视化
# Univariate overview of every column, spread over three figures.
# Fixes relative to the earlier version:
#   * the heart-disease panel now plots `heart_disease` (it used to plot
#     `hypertension` a second time),
#   * the label typo "中分" is corrected to "中风" (stroke),
#   * KDE panels label the y axis as density and put the variable on the x axis,
#   * the heart-disease bar chart is labelled with counts like the other bars.

# --- Figure 1: stroke label, gender, age --------------------------------
fig = plt.figure()
fig.set(alpha=0.2)

plt.subplot2grid((2, 9), (0, 0), colspan=2)
df.stroke.value_counts().plot(kind='bar')
plt.title('中风情况(1为中风)', fontproperties=myfont)
plt.ylabel('人数', fontproperties=myfont)

plt.subplot2grid((2, 9), (0, 3), colspan=2)
df.gender.value_counts().plot(kind='bar')
plt.title('性别情况', fontproperties=myfont)
plt.ylabel('人数', fontproperties=myfont)

plt.subplot2grid((2, 9), (0, 6), colspan=2)
df.age.plot(kind='kde')
plt.title('年龄情况', fontproperties=myfont)
plt.xlabel('年龄', fontproperties=myfont)
plt.ylabel('密度', fontproperties=myfont)

# --- Figure 2: hypertension, heart disease, marital status --------------
fig = plt.figure()
fig.set(alpha=0.2)

plt.subplot2grid((2, 9), (0, 0), colspan=2)
df.hypertension.value_counts().plot(kind='bar')
plt.title('高血压情况(1为高血压)', fontproperties=myfont)
plt.ylabel('人数', fontproperties=myfont)

plt.subplot2grid((2, 9), (0, 3), colspan=2)
df.heart_disease.value_counts().plot(kind='bar')
plt.title('心脏病情况', fontproperties=myfont)
plt.ylabel('人数', fontproperties=myfont)

plt.subplot2grid((2, 9), (0, 6), colspan=2)
df.ever_married.value_counts().plot(kind='bar')
plt.title('结婚情况', fontproperties=myfont)
plt.ylabel('人数', fontproperties=myfont)

# --- Figure 3: work type, residence type, glucose, smoking --------------
fig = plt.figure()
fig.set(alpha=0.2)

plt.subplot2grid((2, 12), (0, 0), colspan=2)
df.work_type.value_counts().plot(kind='bar')
plt.title('工作类型情况', fontproperties=myfont)
plt.ylabel('人数', fontproperties=myfont)

plt.subplot2grid((2, 12), (0, 3), colspan=2)
df.Residence_type.value_counts().plot(kind='bar')
plt.title('住宅类型情况', fontproperties=myfont)
plt.ylabel('人数', fontproperties=myfont)

plt.subplot2grid((2, 12), (0, 6), colspan=2)
df.avg_glucose_level.plot(kind='kde')
plt.title('平均血糖', fontproperties=myfont)
plt.xlabel('血糖浓度', fontproperties=myfont)
plt.ylabel('密度', fontproperties=myfont)

plt.subplot2grid((2, 12), (0, 9), colspan=2)
df.smoking_status.value_counts().plot(kind='bar')
plt.title('抽烟情况', fontproperties=myfont)
plt.ylabel('人数', fontproperties=myfont)
四、其他属性与中风关系可视化
# For every non-continuous column, draw one bar chart of its value counts
# restricted to the rows where stroke == 1.
# The title typo "中分" is corrected to "中风" (stroke).
tmp = df.drop(['avg_glucose_level', 'age', 'bmi', 'stroke'], axis=1)
for column in tmp.columns:
    # .loc with a boolean mask avoids chained indexing (tmp[column][mask]).
    tmp.loc[df.stroke == 1, column].value_counts().plot(kind='bar')
    plt.title(column + '=》中风', fontproperties=myfont)
    plt.ylabel('人数', fontproperties=myfont)
    plt.xlabel(column + '情况', fontproperties=myfont)
    # show() inside the loop so each column gets its own figure.
    plt.show()
五、数据预处理
# Encode the categorical (object-dtype) columns.
#   1. Print the columns with more than two levels.
#   2. Remove the rows whose gender is 'Other', leaving gender with the
#      remaining levels only.
#   3. One-hot encode work_type and smoking_status.
#   4. Print the remaining object columns with at most two levels, then
#      label-encode gender, ever_married and Residence_type to integers.

# nunique() is used instead of len(np.unique(...)): np.unique sorts the values
# and fails on an object column that mixes strings with NaN.
for column in df.select_dtypes(include=['object']):
    if df[column].nunique() > 2:
        print(f"==== [COLUMNS: {column}] ====")
        print(df[column].value_counts())

# Drop every 'Other' row. Indexing with `.index[0]` removed only the first
# such row and raised IndexError when there was none.
df.drop(df[df['gender'] == 'Other'].index, axis=0, inplace=True)

# Dummy columns are appended in the order the source columns are listed,
# i.e. the same layout as two consecutive get_dummies calls.
df = pd.get_dummies(df, columns=['work_type', 'smoking_status'])

for column in df.select_dtypes(include=['object']):
    if df[column].nunique() <= 2:
        print(f"==== [COLUMNS: {column}] ====")
        print(df[column].value_counts())

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# The encoder is refitted for each column, so `le` ends up fitted on the
# last one (Residence_type).
for column in ('gender', 'ever_married', 'Residence_type'):
    df[column] = le.fit_transform(df[column])
六、缺失值处理
# Fill the missing bmi values with a 3-nearest-neighbour imputer.
df.isna().sum()
var_names = list(df)
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
# fit_transform returns a bare ndarray. The frame rebuilt from it must reuse
# df's index: df has a gap where a row was dropped earlier, and with a fresh
# 0..n-1 index the assignment below would align by label, giving every row
# after the gap another row's bmi and leaving the last row NaN.
# NOTE(review): the imputer sees every column of df, including the `stroke`
# target, and is fitted before the train/test split — confirm this leakage is
# acceptable for the tutorial.
dff = pd.DataFrame(imputer.fit_transform(df), columns=var_names, index=df.index)
df['bmi'] = dff['bmi']
# Safety net: remove any row that still has a missing value.
df.dropna(axis=0, how='any', inplace=True)
df.isna().sum()
七、模型预测
# Train several off-the-shelf classifiers with default hyper-parameters and
# compare their accuracy on a 20% hold-out split.
y = df['stroke']
x = df.drop(['stroke'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=11, shuffle=True)

# Fit the scaler on the training split only, then apply it to the test split.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

algorithms = [
    LogisticRegression,
    KNeighborsClassifier,
    MLPClassifier,
    DecisionTreeClassifier,
    RandomForestClassifier,
    GaussianNB,
    AdaBoostClassifier,
    BaggingClassifier,
]

# Collect one row per model and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, so the row-by-row append fails
# on current pandas.
scores = []
for algorithm in algorithms:
    score = {"算法": algorithm.__name__}
    model = algorithm().fit(x_train, y_train)
    y_pred = model.predict(x_test)
    # Accuracy is stored as a percentage.
    # NOTE(review): if stroke cases are rare, accuracy alone can look high for
    # a model that never predicts a stroke — consider recall/F1 as well.
    score["accuracy_score"] = accuracy_score(y_test, y_pred) * 100
    scores.append(score)
score_df = pd.DataFrame(scores, columns=["算法", "accuracy_score"])

score_df.sort_values(by="accuracy_score", ascending=False, inplace=True)
score_df

score_df.plot(x='算法', y='accuracy_score', kind='bar')
plt.title('算法排名', fontproperties=myfont)
plt.ylabel('准确率', fontproperties=myfont)
plt.xlabel('算法', fontproperties=myfont)
plt.show()
完整代码
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import matplotlib
# Font loaded from a local TTF file; it is passed as `fontproperties=` to every
# Chinese title/axis label below so those labels use this font.
myfont = matplotlib.font_manager.FontProperties(fname="./DroidSansFallback.ttf")
# Draw negative tick labels with an ASCII hyphen instead of the Unicode minus sign.
plt.rcParams['axes.unicode_minus'] = False
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import scale, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
import warnings
warnings.filterwarnings('ignore')
# Read the raw CSV once; `data` stays untouched and all work happens on `df`.
data = pd.read_csv(
    "data/healthcare-dataset-stroke-data.csv",
    encoding='UTF-8',
)
df = data.copy()

# Quick preview of the first rows.
df.head()

# The `id` column is removed before any further analysis.
df.drop(columns=['id'], inplace=True)

# Column dtypes / non-null counts, then summary statistics.
df.info()
df.describe()
# Univariate overview of every column, spread over three figures.
# Fixes relative to the earlier version:
#   * the heart-disease panel now plots `heart_disease` (it used to plot
#     `hypertension` a second time),
#   * the label typo "中分" is corrected to "中风" (stroke),
#   * KDE panels label the y axis as density and put the variable on the x axis,
#   * the heart-disease bar chart is labelled with counts like the other bars.

# --- Figure 1: stroke label, gender, age --------------------------------
fig = plt.figure()
fig.set(alpha=0.2)

plt.subplot2grid((2, 9), (0, 0), colspan=2)
df.stroke.value_counts().plot(kind='bar')
plt.title('中风情况(1为中风)', fontproperties=myfont)
plt.ylabel('人数', fontproperties=myfont)

plt.subplot2grid((2, 9), (0, 3), colspan=2)
df.gender.value_counts().plot(kind='bar')
plt.title('性别情况', fontproperties=myfont)
plt.ylabel('人数', fontproperties=myfont)

plt.subplot2grid((2, 9), (0, 6), colspan=2)
df.age.plot(kind='kde')
plt.title('年龄情况', fontproperties=myfont)
plt.xlabel('年龄', fontproperties=myfont)
plt.ylabel('密度', fontproperties=myfont)

# --- Figure 2: hypertension, heart disease, marital status --------------
fig = plt.figure()
fig.set(alpha=0.2)

plt.subplot2grid((2, 9), (0, 0), colspan=2)
df.hypertension.value_counts().plot(kind='bar')
plt.title('高血压情况(1为高血压)', fontproperties=myfont)
plt.ylabel('人数', fontproperties=myfont)

plt.subplot2grid((2, 9), (0, 3), colspan=2)
df.heart_disease.value_counts().plot(kind='bar')
plt.title('心脏病情况', fontproperties=myfont)
plt.ylabel('人数', fontproperties=myfont)

plt.subplot2grid((2, 9), (0, 6), colspan=2)
df.ever_married.value_counts().plot(kind='bar')
plt.title('结婚情况', fontproperties=myfont)
plt.ylabel('人数', fontproperties=myfont)

# --- Figure 3: work type, residence type, glucose, smoking --------------
fig = plt.figure()
fig.set(alpha=0.2)

plt.subplot2grid((2, 12), (0, 0), colspan=2)
df.work_type.value_counts().plot(kind='bar')
plt.title('工作类型情况', fontproperties=myfont)
plt.ylabel('人数', fontproperties=myfont)

plt.subplot2grid((2, 12), (0, 3), colspan=2)
df.Residence_type.value_counts().plot(kind='bar')
plt.title('住宅类型情况', fontproperties=myfont)
plt.ylabel('人数', fontproperties=myfont)

plt.subplot2grid((2, 12), (0, 6), colspan=2)
df.avg_glucose_level.plot(kind='kde')
plt.title('平均血糖', fontproperties=myfont)
plt.xlabel('血糖浓度', fontproperties=myfont)
plt.ylabel('密度', fontproperties=myfont)

plt.subplot2grid((2, 12), (0, 9), colspan=2)
df.smoking_status.value_counts().plot(kind='bar')
plt.title('抽烟情况', fontproperties=myfont)
plt.ylabel('人数', fontproperties=myfont)
# For every non-continuous column, draw one bar chart of its value counts
# restricted to the rows where stroke == 1.
# The title typo "中分" is corrected to "中风" (stroke).
tmp = df.drop(['avg_glucose_level', 'age', 'bmi', 'stroke'], axis=1)
for column in tmp.columns:
    # .loc with a boolean mask avoids chained indexing (tmp[column][mask]).
    tmp.loc[df.stroke == 1, column].value_counts().plot(kind='bar')
    plt.title(column + '=》中风', fontproperties=myfont)
    plt.ylabel('人数', fontproperties=myfont)
    plt.xlabel(column + '情况', fontproperties=myfont)
    # show() inside the loop so each column gets its own figure.
    plt.show()
# Encode the categorical (object-dtype) columns.
#   1. Print the columns with more than two levels.
#   2. Remove the rows whose gender is 'Other', leaving gender with the
#      remaining levels only.
#   3. One-hot encode work_type and smoking_status.
#   4. Print the remaining object columns with at most two levels, then
#      label-encode gender, ever_married and Residence_type to integers.

# nunique() is used instead of len(np.unique(...)): np.unique sorts the values
# and fails on an object column that mixes strings with NaN.
for column in df.select_dtypes(include=['object']):
    if df[column].nunique() > 2:
        print(f"==== [COLUMNS: {column}] ====")
        print(df[column].value_counts())

# Drop every 'Other' row. Indexing with `.index[0]` removed only the first
# such row and raised IndexError when there was none.
df.drop(df[df['gender'] == 'Other'].index, axis=0, inplace=True)

# Dummy columns are appended in the order the source columns are listed,
# i.e. the same layout as two consecutive get_dummies calls.
df = pd.get_dummies(df, columns=['work_type', 'smoking_status'])
df.info()

for column in df.select_dtypes(include=['object']):
    if df[column].nunique() <= 2:
        print(f"==== [COLUMNS: {column}] ====")
        print(df[column].value_counts())

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# The encoder is refitted for each column, so `le` ends up fitted on the
# last one (Residence_type).
for column in ('gender', 'ever_married', 'Residence_type'):
    df[column] = le.fit_transform(df[column])
# Fill the missing bmi values with a 3-nearest-neighbour imputer.
df.isna().sum()
var_names = list(df)
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
# fit_transform returns a bare ndarray. The frame rebuilt from it must reuse
# df's index: df has a gap where a row was dropped earlier, and with a fresh
# 0..n-1 index the assignment below would align by label, giving every row
# after the gap another row's bmi and leaving the last row NaN.
# NOTE(review): the imputer sees every column of df, including the `stroke`
# target, and is fitted before the train/test split — confirm this leakage is
# acceptable for the tutorial.
dff = pd.DataFrame(imputer.fit_transform(df), columns=var_names, index=df.index)
df['bmi'] = dff['bmi']
# Safety net: remove any row that still has a missing value.
df.dropna(axis=0, how='any', inplace=True)
df.isna().sum()
# Train several off-the-shelf classifiers with default hyper-parameters and
# compare their accuracy on a 20% hold-out split.
y = df['stroke']
x = df.drop(['stroke'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=11, shuffle=True)

# Fit the scaler on the training split only, then apply it to the test split.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

algorithms = [
    LogisticRegression,
    KNeighborsClassifier,
    MLPClassifier,
    DecisionTreeClassifier,
    RandomForestClassifier,
    GaussianNB,
    AdaBoostClassifier,
    BaggingClassifier,
]

# Collect one row per model and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, so the row-by-row append fails
# on current pandas.
scores = []
for algorithm in algorithms:
    score = {"算法": algorithm.__name__}
    model = algorithm().fit(x_train, y_train)
    y_pred = model.predict(x_test)
    # Accuracy is stored as a percentage.
    # NOTE(review): if stroke cases are rare, accuracy alone can look high for
    # a model that never predicts a stroke — consider recall/F1 as well.
    score["accuracy_score"] = accuracy_score(y_test, y_pred) * 100
    scores.append(score)
score_df = pd.DataFrame(scores, columns=["算法", "accuracy_score"])

score_df.sort_values(by="accuracy_score", ascending=False, inplace=True)

score_df.plot(x='算法', y='accuracy_score', kind='bar')
plt.title('算法排名', fontproperties=myfont)
plt.ylabel('准确率', fontproperties=myfont)
plt.xlabel('算法', fontproperties=myfont)
plt.show()