Python EDA 数据分析示例(二分类问题,附源代码)

import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import kernel_approximation
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import warnings
warnings.filterwarnings("ignore")
# Load the no-show appointments CSV straight from GitHub into a DataFrame.
data = pd.read_csv(
    'https://raw.githubusercontent.com/wzy6642/Machine-Learning-Case-Studies/master/noshowappointments/data/No-show-Issue-Comma-300k.csv'
)
print(len(data))
data.head()
# Print one line per column: the column name padded to 25 characters
# ({0:25}) followed by the number of distinct values in that column
# (nunique() counts the different values a series contains).
for column in data.columns:
    distinct_count = data[column].nunique()
    print("{0:25} {1}".format(column, distinct_count))
def features_plots(discrete_vars):
    """Draw frequency charts for the columns of the module-level ``data`` frame.

    The 'Age' and 'AwaitingTime' columns are drawn as histograms in the
    first two cells of a 7x2 subplot grid; every column named in
    ``discrete_vars`` is then drawn as a bar chart of its value counts in
    the following cells.

    Args:
        discrete_vars: Iterable of column names of ``data`` to chart as
            bar plots.
    """
    grid_rows, grid_cols = 7, 2
    plt.figure(figsize=(15, 24.5))

    # Histograms: one bin per distinct value of the column.
    for cell, name in enumerate(['Age', 'AwaitingTime'], start=1):
        plt.subplot(grid_rows, grid_cols, cell)
        bin_count = len(data[name].unique())
        plt.hist(data[name], bins=bin_count)
        plt.title(name)
        plt.ylabel('Frequency')

    # Bar charts start at the third grid cell, right after the histograms.
    for cell, name in enumerate(discrete_vars, start=3):
        plt.subplot(grid_rows, grid_cols, cell)
        value_frequencies = data[name].value_counts()
        value_frequencies.plot(kind='bar', title=name)
        plt.ylabel('Frequency')
# Chart the raw discrete columns before any cleaning is applied.
discrete_vars = ['Gender', 'DayOfTheWeek', 'Status', 'Diabetes', 'Alcoolism', 'HiperTension',
                 'Handcap', 'Smokes', 'Scholarship', 'Tuberculosis', 'Sms_Reminder']
features_plots(discrete_vars)
# Count of rows with a negative age (the value is only echoed in an
# interactive session; as a script this expression has no visible effect).
data[data['Age'] < 0]['Age'].value_counts().sum()
# Keep only rows with a non-negative age. The explicit copy makes the
# column assignments below operate on an independent frame instead of a
# filtered view of the original (avoids pandas' chained-assignment issue).
data = data[data['Age'] >= 0].copy()
del data['Handcap']
# Replace AwaitingTime by its absolute value.
data['AwaitingTime'] = data['AwaitingTime'].abs()
# Encode weekday names as integers, Monday=0 ... Sunday=6.
dow_mapping = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3, 'Friday': 4, 'Saturday': 5, 'Sunday': 6}
data['DayOfTheWeek'] = data['DayOfTheWeek'].map(dow_mapping)
for field in ['Gender', 'Status']:
    # pd.Categorical(...).codes yields, for each value, the index of its
    # category, turning the categorical labels into numeric codes.
    data[field] = pd.Categorical(data[field]).codes
# Chart the discrete columns again after cleaning ('Handcap' was dropped).
discrete_vars = ['Gender', 'DayOfTheWeek', 'Status', 'Diabetes', 'Alcoolism', 'HiperTension',
                 'Smokes', 'Scholarship', 'Tuberculosis', 'Sms_Reminder']
features_plots(discrete_vars)

# Start a fresh figure so the scatter is not drawn onto whichever subplot
# an earlier plotting call left as the current axes.
plt.figure()
plt.scatter(data['Age'], data['AwaitingTime'], s=0.5)
plt.title('Scatter plot of Age and Awaiting Time')
plt.xlabel('Age')
plt.ylabel('Awaiting Time')
plt.xlim(0, 120)
plt.ylim(0, 120)
pd.set_option('display.width', 100)
# Use the fully qualified option name: the bare 'precision' pattern is
# ambiguous on newer pandas releases and raises an OptionError there.
pd.set_option('display.precision', 3)
# Pearson correlation between age and awaiting time.
correlations = data[['Age', 'AwaitingTime']].corr(method='pearson')
print(correlations)
# Stacked bar charts of the two Status codes (0 and 1), grouped first by
# the number of SMS reminders and then by the day of the week.
status_breakdowns = [
    ('Sms_Reminder',
     'Frequency of people showing up and not showing up by number of SMS reminders sent',
     'Number of SMS reminders'),
    ('DayOfTheWeek',
     'Frequency of people showing up and not showing up by Day of the week',
     'Day of the week'),
]
for group_col, chart_title, x_label in status_breakdowns:
    # Rows per (group value, Status) pair, pivoted so that each Status code
    # becomes a column; combinations that never occur are filled with 0.
    pair_counts = data.groupby([group_col, 'Status'])[group_col].count()
    data_dow_status = pair_counts.unstack('Status').fillna(0)
    data_dow_status[[0, 1]].plot(kind='bar', stacked=True)
    plt.title(chart_title)
    plt.xlabel(x_label)
    plt.ylabel('Frequency')

# Box plot of Age, one box per Status code.
data.boxplot(column=['Age'], return_type='axes', by='Status')
plt.show()
# Two side-by-side panels (Status code 0, then 1); each panel draws one
# age-frequency line per Gender code.
plt.figure(figsize=(15, 3.5))
for status_code, status_label in enumerate(['no show ups', 'show ups']):
    plt.subplot(1, 2, status_code + 1)
    status_rows = data[data['Status'] == status_code]
    for gender_code in (0, 1):
        gender_rows = status_rows[status_rows['Gender'] == gender_code]
        # Number of patients per age, ordered by age.
        age_frequency = gender_rows['Age'].value_counts().sort_index()
        age_frequency.plot()
    plt.title('Age wise frequency of patient %s for both genders' % status_label)
    plt.xlabel('Age')
    plt.ylabel('Frequency')
    # NOTE(review): assumes Gender code 0 is female and 1 is male - confirm
    # against the category order produced when Gender was encoded.
    plt.legend(['Female', 'Male'], loc='upper left')

# Box plot of AwaitingTime, one box per Status code.
data.boxplot(column=['AwaitingTime'], return_type='axes', by='Status')
plt.show()
# Split the timestamp strings into integer component columns named
# '<column>_<component>'. The date part is everything before 'T', split on
# '-'; the time part is everything after 'T' with its last character
# dropped (presumably a trailing 'Z' - confirm against the data), split on ':'.
# Only AppointmentRegistration gets hour/min/sec columns, and they are now
# computed once instead of once per processed column.
for col, with_time in (('AppointmentRegistration', True), ('ApointmentData', False)):
    for index, component in enumerate(['year', 'month', 'day']):
        data['%s_%s' % (col, component)] = data[col].apply(
            lambda x, index=index: int(x.split('T')[0].split('-')[index]))
    if not with_time:
        continue
    for index, component in enumerate(['hour', 'min', 'sec']):
        data['%s_%s' % (col, component)] = data[col].apply(
            lambda x, index=index: int(x.split('T')[1][:-1].split(':')[index]))
data.head()
# Classification
def model_performance(model, model_name, X_train, y_train, y_test, Y_pred):
    """Print evaluation metrics for a fitted classifier and plot its ROC curve.

    Prints the test accuracy, the ROC AUC, the training accuracy and the
    area under the precision-recall curve, then shows a ROC plot with the
    chance diagonal for reference.

    Args:
        model: Fitted estimator exposing ``score(X, y)``.
        model_name: Label printed as the header of the report.
        X_train: Training features, passed to ``model.score``.
        y_train: Training labels, passed to ``model.score``.
        y_test: True binary labels of the test set.
        Y_pred: Predicted labels for the test set, aligned with ``y_test``.
    """
    print('Model name: %s' % model_name)
    # Accuracy score: fraction of all samples classified correctly.
    print('Test accuracy (Accuracy Score): %f' % metrics.accuracy_score(y_test, Y_pred))
    # ROC AUC computed from the true (binary) labels and the predictions.
    print('Test accuracy (ROC AUC Score): %f' % metrics.roc_auc_score(y_test, Y_pred))
    # Accuracy of the model on the training data.
    print('Train accuracy: %f' % model.score(X_train, y_train))
    # precision_recall_curve returns (precision, recall, thresholds).
    precision, recall, _ = metrics.precision_recall_curve(y_test, Y_pred)
    # Area under the PR curve: recall is the x axis, precision the y axis.
    print('Area Under the Precision-Recall Curve: %f' % metrics.auc(recall, precision))
    # ROC curve: true positive rate (y) against false positive rate (x).
    false_positive_rate, true_positive_rate, _ = metrics.roc_curve(y_test, Y_pred)
    roc_auc = metrics.auc(false_positive_rate, true_positive_rate)
    plt.title('Receiver Operating Characteristic')
    # Draw the ROC curve, labelled with its AUC.
    plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    # Dashed diagonal: the ROC of a random classifier.
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([-0.1, 1.2])
    plt.ylim([-0.1, 1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
# Columns used as model inputs; 'Status' is the prediction target.
features_of_choice = [
    'Age',
    'Gender',
    'DayOfTheWeek',
    'Diabetes',
    'Alcoolism',
    'HiperTension',
    'Smokes',
    'Scholarship',
    'Tuberculosis',
    'Sms_Reminder',
    'AwaitingTime',
    'AppointmentRegistration_year',
    'AppointmentRegistration_month',
    'AppointmentRegistration_day',
    'AppointmentRegistration_hour',
    'AppointmentRegistration_min',
    'AppointmentRegistration_sec',
    'ApointmentData_year',
    'ApointmentData_month',
    'ApointmentData_day',
]
feature_frame = data[features_of_choice]
x = np.array(feature_frame)
y = np.array(data['Status'])
# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1,
)
# Decision tree
# Fit on the training split, predict the test split and report the metrics.
clf = DecisionTreeClassifier().fit(x_train, y_train)
y_pred = clf.predict(x_test)
model_performance(
    model=clf,
    model_name='Decision tree classifier',
    X_train=x_train,
    y_train=y_train,
    y_test=y_test,
    Y_pred=y_pred,
)
# SGD classifier
# Train an SGD classifier on an RBF kernel approximation of the features.

rbf_feature = kernel_approximation.RBFSampler(gamma=1, random_state=1)
X_train = rbf_feature.fit_transform(x_train)
clf = SGDClassifier()
clf.fit(X_train, y_train)
# Apply the mapping fitted on the training data; the transformer must not
# be refitted on the test set.
X_test = rbf_feature.transform(x_test)
Y_pred = clf.predict(X_test)
model_performance(clf, 'Kernel approximation', X_train, y_train, y_test, Y_pred)

# Random forest
# Fit on the training split, predict the test split and report the metrics.
clf = RandomForestClassifier().fit(x_train, y_train)
y_pred = clf.predict(x_test)
model_performance(
    model=clf,
    model_name='Random Forest',
    X_train=x_train,
    y_train=y_train,
    y_test=y_test,
    Y_pred=y_pred,
)

# Gradient boosting
clf = GradientBoostingClassifier(
    random_state=10,
    learning_rate=0.1,
    n_estimators=200,
    max_depth=5,
    max_features=10,
).fit(x_train, y_train)

y_pred = clf.predict(x_test)
model_performance(
    model=clf,
    model_name='Grandient Boosting',
    X_train=x_train,
    y_train=y_train,
    y_test=y_test,
    Y_pred=y_pred,
)

# Print each input column next to the importance the fitted model gave it.
for feature, score in zip(features_of_choice, clf.feature_importances_):
    print('%s\t\t\t\t\t%f' % (feature, score))

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值