文章目录
前言
代码是从Jupyter Notebook导出来的
过程中借鉴了一些数据清洗的写法,有时间再补充。
好记性不如烂笔头,免得下次又到处查语法。
py版本
# -*- coding: utf-8 -*-
# @Time : 18-11-1 上午10:43
# @Author : wanghai
# @Email :
# @File : testt.py
# @Software: PyCharm Community Edition
# coding: utf-8
# In[1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

try:
    # scikit-learn >= 0.18; the old module below was removed in 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# In[2]:
# Read the raw table, then work on a copy without the application id column.
raw_df = pd.read_csv('data.csv')
df1 = raw_df.drop('apply_id', axis=1)
# Summary statistics, used to eyeball how many outliers there are
# (only rendered when run as a notebook cell).
df1.describe()
# In[3]:
def scatterplot(x_data, y_data, area, alpha, x_label="", y_label="", title="", color="g"):
    """Draw a scatter plot of ``y_data`` against ``x_data`` and show it.

    Args:
        x_data: Values for the horizontal axis.
        y_data: Values for the vertical axis.
        area: Marker size, forwarded to ``plt.scatter`` as ``s``.
        alpha: Marker opacity, forwarded to ``plt.scatter`` as ``alpha``.
        x_label: Text of the x-axis label.
        y_label: Text of the y-axis label; also used as the legend entry.
        title: Figure title.
        color: Marker colour, forwarded to ``plt.scatter`` as ``c``.
    """
    # Plot the arguments themselves rather than any module-level ``x``/``y``.
    plt.scatter(x_data, y_data, s=area, alpha=alpha, c=color, label=y_label or None)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    # A legend needs at least one labelled artist; skip it when there is none.
    if y_label:
        plt.legend(loc='upper left')
    plt.show()
# # 数据清洗,标签准备
# In[4]:
# Difference between the actual and the planned repayment time, in days.
act_repay_time = pd.to_datetime(df1['act_repay_dt'])
plan_repay_time = pd.to_datetime(df1['plan_repay_dt'])
df1['date'] = (act_repay_time - plan_repay_time).dt.total_seconds() / (24 * 60 * 60)
# Visualisation: the same series is used for both axes.
x = df1['date']
y = x
area = np.pi * 3
scatterplot(x, y, area, 0.7, x_label="date", y_label="y", title="pay time img")
# In[5]:
# Rows without an actual repayment date have no delay value; leave them out.
date_show = df1['date'].dropna()
# Plain matplotlib histogram with 155 bins.
plt.hist(date_show, bins=155, facecolor='blue', edgecolor='black')
# seaborn distplot on the same axes; with kde=False it draws a second,
# finer histogram (500 bins) rather than a kernel density estimate.
sns.distplot(
    date_show,
    hist=True,
    kde=False,
    bins=500,
    color='blue',
    hist_kws=dict(edgecolor='black')
)
plt.title('Histogram of pay date')
plt.xlabel('pay date')
plt.ylabel('people count')
plt.show()
# In[6]:
print('The shape of our features is:', df1.shape)
# Label preparation: y = 1 when there is no actual repayment date or the
# repayment came more than 7 days after the planned date, otherwise 0.
overdue_mask = pd.isnull(df1['act_repay_dt']) | (df1['date'] > 7)
df1['y'] = np.where(overdue_mask, 1, 0)
illegal = df1[overdue_mask]
print("至今未还款或者还款时间逾期的人有 %d 人,占比 %.3f" % (len(illegal), float(len(illegal)) / float(len(df1))))
# First pass at removing distracting columns: the two raw date columns and
# the derived delay, which the label was computed from.
columns = ['act_repay_dt', 'plan_repay_dt', 'date']
df1.drop(columns, inplace=True, axis=1)
# Crude outlier trimming (TODO: improve this method): for every remaining
# column, drop the 3 rows holding its largest values. Smallest values are
# not touched.
# NOTE(review): the label column y is in this list too, so 3 rows with the
# largest y are dropped as well - confirm this is intended.
columns = df1.columns.tolist()
for col in columns:
    top_rows = df1.nlargest(3, columns=[col]).index.values
    df1.drop(top_rows, inplace=True)
print('The shape of our features after del is:', df1.shape)
# TODO:计算相关性,干掉相关系数特别高的
# In[7]:
# Preview the first three rows (only rendered when run as a notebook cell).
df1.head(3)
# # Fill missing values with the column mean
# In[8]:
df1 = df1.fillna(df1.mean())
# Every column but the last is a feature; the last column is the label y.
x = np.array(df1.iloc[:, :-1])
y = np.array(df1.iloc[:, -1])
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=11
)
# dt = DictVectorizer(sparse=False)
# x_train = dt.fit_transform(x_train.to_dict())
# x_test = dt.fit_transform(x_test.to_dict())
print('Training Features Shape:', x_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', x_test.shape)
print('Testing Labels Shape:', y_test.shape)
# In[9]:
# # Decision-tree variant (disabled)
# dtc = DecisionTreeClassifier()
# dtc.fit(x_train, y_train)
# dt_predict = dtc.predict(x_test)
# print(dtc.score(x_test, y_test))
# print(classification_report(y_test, dt_predict, target_names=["died", "survived"]))
# Random-forest variant: default hyper-parameters, fitted on the training split.
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
rfc_y_predict = rfc.predict(x_test)
# score() returns the mean accuracy on the given test data and labels.
test_accuracy = rfc.score(x_test, y_test)
print("均值填充平均精度为:{:.2f}".format(test_accuracy))
# In[11]:
print("The accuracy/recall rate and other results are as follows:")
# Per-class precision / recall / F1 on the test split; label 0 is shown as
# plan_repay and label 1 as overdue_repay.
report = classification_report(
    y_test, rfc_y_predict, target_names=["plan_repay", "overdue_repay"]
)
print(report)
# In[12]:
print(rfc_y_predict)
# In[13]:
print(y_test)
# In[14]:
# Feature importances of the fitted forest
print(rfc.feature_importances_)
markdown版本
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
/home/c/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
"This module will be removed in 0.20.", DeprecationWarning)
# Read the raw table, then work on a copy without the (masked) id column.
raw_df = pd.read_csv('data.csv')
df1 = raw_df.drop('×××', axis=1)
# Summary statistics, used to eyeball how many outliers there are.
df1.describe()
def scatterplot(x_data, y_data, area, alpha, x_label="", y_label="", title="", color="g"):
    """Draw a scatter plot of ``y_data`` against ``x_data`` and show it.

    Args:
        x_data: Values for the horizontal axis.
        y_data: Values for the vertical axis.
        area: Marker size, forwarded to ``plt.scatter`` as ``s``.
        alpha: Marker opacity, forwarded to ``plt.scatter`` as ``alpha``.
        x_label: Text of the x-axis label.
        y_label: Text of the y-axis label; also used as the legend entry.
        title: Figure title.
        color: Marker colour, forwarded to ``plt.scatter`` as ``c``.
    """
    # Plot the arguments themselves rather than any module-level ``x``/``y``.
    plt.scatter(x_data, y_data, s=area, alpha=alpha, c=color, label=y_label or None)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    # A legend needs at least one labelled artist; skip it when there is none.
    if y_label:
        plt.legend(loc='upper left')
    plt.show()
# 数据清洗,标签准备
# Difference between the two (masked) timestamp columns, in days.
later_time = pd.to_datetime(df1['×××'])
earlier_time = pd.to_datetime(df1['×××'])
df1['date'] = (later_time - earlier_time).dt.total_seconds() / (24 * 60 * 60)
# Visualisation: the same series is used for both axes.
x = df1['date']
y = x
area = np.pi * 3
scatterplot(x, y, area, 0.7, x_label="date", y_label="y", title="pay time img")
# Rows with a missing delay value are left out of the plots.
date_show = df1['date'].dropna()
# Plain matplotlib histogram with 155 bins.
plt.hist(date_show, bins=155, facecolor='blue', edgecolor='black')
# seaborn distplot on the same axes; with kde=False it draws a second,
# finer histogram (500 bins) rather than a kernel density estimate.
sns.distplot(
    date_show,
    hist=True,
    kde=False,
    bins=500,
    color='blue',
    hist_kws=dict(edgecolor='black')
)
plt.title('Histogram of pay date')
plt.xlabel('pay date')
plt.ylabel('people count')
plt.show()
/home/c/anaconda2/lib/python2.7/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.
warnings.warn("The 'normed' kwarg is deprecated, and has been "
print('The shape of our features is:', df1.shape)
# Label preparation: y = 1 when there is no actual repayment date or the
# delay exceeds 7 days, otherwise 0.
overdue_mask = pd.isnull(df1['act_repay_dt']) | (df1['date'] > 7)
df1['y'] = np.where(overdue_mask, 1, 0)
illegal = df1[overdue_mask]
print("至今未还款或者还款时间逾期的人有 %d 人,占比 %.3f" % (len(illegal), float(len(illegal)) / float(len(df1))))
# First pass at removing distracting columns: the two raw date columns and
# the derived delay, which the label was computed from.
columns = ['act_repay_dt', 'plan_repay_dt', 'date']
df1.drop(columns, inplace=True, axis=1)
# Crude outlier trimming (TODO: improve this method): for every remaining
# column, drop the 3 rows holding its largest values. Smallest values are
# not touched.
# NOTE(review): the label column y is in this list too, so 3 rows with the
# largest y are dropped as well - confirm this is intended.
columns = df1.columns.tolist()
for col in columns:
    top_rows = df1.nlargest(3, columns=[col]).index.values
    df1.drop(top_rows, inplace=True)
print('The shape of our features after del is:', df1.shape)
# TODO: compute correlations and drop features whose correlation is very high
('The shape of our features is:', (12154, 221))
至今未还款或者还款时间逾期的人有 1837 人,占比 0.151
('The shape of our features after del is:', (11497, 219))
# Preview the first three rows.
df1.head(3)
# Fill missing values with the column mean
df1 = df1.fillna(df1.mean())
# Every column but the last is a feature; the last column is the label y.
x = np.array(df1.iloc[:, :-1])
y = np.array(df1.iloc[:, -1])
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=11
)
# dt = DictVectorizer(sparse=False)
# x_train = dt.fit_transform(x_train.to_dict())
# x_test = dt.fit_transform(x_test.to_dict())
print('Training Features Shape:', x_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', x_test.shape)
print('Testing Labels Shape:', y_test.shape)
('Training Features Shape:', (8047, 218))
('Training Labels Shape:', (8047,))
('Testing Features Shape:', (3450, 218))
('Testing Labels Shape:', (3450,))
# # Decision-tree variant (disabled)
# dtc = DecisionTreeClassifier()
# dtc.fit(x_train, y_train)
# dt_predict = dtc.predict(x_test)
# print(dtc.score(x_test, y_test))
# print(classification_report(y_test, dt_predict, target_names=["died", "survived"]))
# Random-forest variant: default hyper-parameters, fitted on the training split.
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
rfc_y_predict = rfc.predict(x_test)
# score() returns the mean accuracy on the given test data and labels.
test_accuracy = rfc.score(x_test, y_test)
print("均值填充平均精度为:{:.2f}".format(test_accuracy))
均值填充平均精度为:0.86
print("The accuracy/recall rate and other results are as follows:")
# Per-class precision / recall / F1 on the test split; label 0 is shown as
# plan_repay and label 1 as overdue_repay.
report = classification_report(
    y_test, rfc_y_predict, target_names=["plan_repay", "overdue_repay"]
)
print(report)
The accuracy/recall rate and other results are as follows:
precision recall f1-score support
plan_repay 0.87 0.99 0.92 2976
overdue_repay 0.33 0.04 0.07 474
avg / total 0.79 0.86 0.81 3450
# Notebook echo: labels the random forest predicted for the test split.
rfc_y_predict
array([0, 0, 0, ..., 0, 0, 0])
# Notebook echo: true labels of the test split, for comparison with the predictions.
y_test
array([0, 0, 0, ..., 0, 0, 0])
# Feature importances of the fitted random forest
rfc.feature_importances_
调优
max_features、n_estimators、min_samples_leaf
可参考,CSDN–BYR_jiandong:随机森林的几个重要参数
设置交叉验证
# Hyper-parameter grid for the cross-validated search. Every entry must be a
# "key: values" pair; "key = values" inside a dict literal is a SyntaxError.
cv_parameter = [{
    'min_samples_leaf': [5, 15, 25, 35],
    'n_estimators': [50, 200, 500],
    'max_depth': [2, 3, 5],
}]
n_jobs 并行:n_jobs 指定并行运行的任务数(n_jobs=1 为单进程,n_jobs=-1 表示使用全部 CPU 核)
# Exhaustive search over cv_parameter with 5-fold cross-validation,
# run in a single process (n_jobs=1).
clf = GridSearchCV(
    estimator=rfc,
    param_grid=cv_parameter,
    cv=5,
    n_jobs=1
)
max_depth :
整数或None,可选(默认=None)
树的最大深度。如果为None,则扩展节点直到所有叶子都是纯的或直到所有叶子包含少于min_samples_split样本。
from sklearn.model_selection import GridSearchCV
# Base estimator: sqrt(n_features) candidates per split, fixed seed.
rfc = RandomForestClassifier(max_features='sqrt', random_state=3)
# Grid to explore: 3 x 4 x 3 = 36 parameter combinations.
cv_parameter = [{
    'n_estimators': [50, 200, 500],
    'min_samples_leaf': [5, 15, 25, 35],
    'max_depth': [2, 3, 5],
}]
# 5-fold cross-validated search, run in a single process.
clf = GridSearchCV(
    estimator=rfc,
    param_grid=cv_parameter,
    cv=5,
    n_jobs=1
)
clf.fit(x_train, y_train)
print('Best parameters:')
print(clf.best_params_)
设置权重
# Weight class 1 five times as heavily as class 0.
rfc = RandomForestClassifier(
    random_state=3,
    class_weight={0: 1, 1: 5}
)
关于结果classification_report
预测出25个正样本,其中11个预测正确,而真实正样本共474个。精确率(precision)= 11/25 = 0.44,召回率(recall)= 11/474 ≈ 0.023