记一次随机森林小实践

前言

代码是从Jupyter Notebook导出来的
过程中借鉴了一些数据清洗的写法,有时间再补充。
好记性不如烂笔头,免得下次又到处查语法。

py版本

# -*- coding: utf-8 -*-
# @Time    : 18-11-1 上午10:43
# @Author  : wanghai
# @Email   : 
# @File    : testt.py
# @Software: PyCharm Community Edition

# coding: utf-8

# In[1]:


import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cross_validation import train_test_split

# In[2]:


# Read the raw export, then work on a copy without the 'apply_id' column.
raw_df = pd.read_csv('data.csv')
df1 = raw_df.drop(labels=['apply_id'], axis='columns')
# Summary statistics, used to eyeball how many outliers there are.
df1.describe()


# In[3]:


def scatterplot(x_data, y_data, area, alpha, x_label="", y_label="", title="", color="g"):
    """Draw a scatter plot of ``y_data`` against ``x_data`` and show it.

    Args:
        x_data: Values for the horizontal axis.
        y_data: Values for the vertical axis.
        area: Marker size, forwarded to ``plt.scatter`` as ``s``.
        alpha: Marker opacity, forwarded to ``plt.scatter``.
        x_label: Text for the x-axis label.
        y_label: Text for the y-axis label.
        title: Plot title.
        color: Marker colour, forwarded to ``plt.scatter`` as ``c``.
    """
    # Plot the arguments themselves rather than module-level ``x``/``y``, so
    # the function draws whatever the caller passes in.
    plt.scatter(x_data, y_data, s=area, alpha=alpha, c=color)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    # NOTE(review): the scatter call above sets no ``label``, so this legend
    # presumably has no entries of its own -- confirm whether it is needed.
    plt.legend(loc='upper left')
    plt.show()


# # 数据清洗,标签准备

# In[4]:


# Gap between the actual and the planned repayment date, expressed in days.
actual_repay = pd.to_datetime(df1['act_repay_dt'])
planned_repay = pd.to_datetime(df1['plan_repay_dt'])
seconds_per_day = 24 * 60 * 60
df1['date'] = (actual_repay - planned_repay).dt.total_seconds() / seconds_per_day
# Visualise the gap by plotting the series against itself.
x = df1['date']
y = x
area = np.pi * 3
scatterplot(x, y, area, 0.7, x_label="date", y_label="y", title="pay time img")

# In[5]:


# Repayment gaps with the NaN entries removed.
date_show = df1['date'].dropna()

# Plain matplotlib histogram, 155 bins.
plt.hist(date_show, bins=155, facecolor='blue', edgecolor='black')

# Second histogram through seaborn, 500 bins; kde=False, so only bars are
# requested and no kernel-density curve.
bar_style = {'edgecolor': 'black'}
sns.distplot(date_show, bins=500, hist=True, kde=False, color='blue', hist_kws=bar_style)
plt.title('Histogram of pay date')
plt.xlabel('pay date')
plt.ylabel('people count')
plt.show()

# In[6]:


print('The shape of our features is:', df1.shape)

# Label preparation: y = 1 when there is no actual repayment date or the
# 'date' gap is greater than 7, otherwise 0. The condition is evaluated once
# and reused for both the label and the summary line.
overdue_mask = pd.isnull(df1['act_repay_dt']) | (df1['date'] > 7)
df1['y'] = np.where(overdue_mask, 1, 0)

illegal = df1[overdue_mask]
overdue_count = len(illegal)
overdue_ratio = float(overdue_count) / float(len(df1))
print("至今未还款或者还款时间逾期的人有 %d 人,占比 %.3f" % (overdue_count, overdue_ratio))
columns = ['act_repay_dt', 'plan_repay_dt', 'date']
# Drop the interfering columns (first pass): the two raw date columns and the
# gap derived from them.
df1.drop(columns, inplace=True, axis=1)

# Crude outlier trimming: for every remaining column, remove the 3 rows that
# hold its largest values. (TODO: this method needs improving.)
# NOTE(review): the column list also contains the label 'y', so the loop
# removes three rows selected by the label as well -- confirm this is intended.
columns = df1.columns.tolist()
for col in columns:
    indexs = df1.nlargest(3, columns=[col]).index.values
    # One drop call per column instead of one call per row.
    df1.drop(indexs, inplace=True)

print('The shape of our features after del is:', df1.shape)
# TODO: compute correlations and remove features with a very high coefficient.


# In[7]:


df1.head(3)

# # Fill missing values with the column mean

# In[8]:


df1 = df1.fillna(df1.mean())
# Every column except the last is a feature; the last column is the label.
feature_frame = df1.iloc[:, :-1]
label_column = df1.iloc[:, -1]
x = np.array(feature_frame)
y = np.array(label_column)
# 70 % / 30 % train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=11)

# dt = DictVectorizer(sparse=False)
# x_train = dt.fit_transform(x_train.to_dict())
# x_test = dt.fit_transform(x_test.to_dict())

for caption, split in (
        ('Training Features Shape:', x_train),
        ('Training Labels Shape:', y_train),
        ('Testing Features Shape:', x_test),
        ('Testing Labels Shape:', y_test)):
    print(caption, split.shape)

# In[9]:


# # Decision-tree variant (disabled, kept for reference)
# dtc = DecisionTreeClassifier()

# dtc.fit(x_train, y_train)

# dt_predict = dtc.predict(x_test)

# print(dtc.score(x_test, y_test))

# print(classification_report(y_test, dt_predict, target_names=["died", "survived"]))

# Random-forest variant, constructed with default arguments.
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
rfc_y_predict = rfc.predict(x_test)

# score() returns the mean accuracy on the given test data and labels.
mean_accuracy = rfc.score(x_test, y_test)
print("均值填充平均精度为:{:.2f}".format(mean_accuracy))

# In[11]:


print("The accuracy/recall rate and other results are as follows:")
# Display names for label 0 and label 1, in that order.
class_names = ["plan_repay", "overdue_repay"]
report = classification_report(y_test, rfc_y_predict, target_names=class_names)
print(report)

# In[12]:


# Predicted labels for the test split.
print(rfc_y_predict)

# In[13]:


# True labels for the test split.
print(y_test)

# In[14]:


# Feature importances of the fitted forest.
print(rfc.feature_importances_)

markdown版本

import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
/home/c/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
# Read the raw export, then work on a copy without the masked column.
raw_df = pd.read_csv('data.csv')
df1 = raw_df.drop(labels=['×××'], axis='columns')
# Summary statistics, used to eyeball how many outliers there are.
df1.describe()
def scatterplot(x_data, y_data, area, alpha, x_label="", y_label="", title="", color="g"):
    """Draw a scatter plot of ``y_data`` against ``x_data`` and show it.

    Args:
        x_data: Values for the horizontal axis.
        y_data: Values for the vertical axis.
        area: Marker size, forwarded to ``plt.scatter`` as ``s``.
        alpha: Marker opacity, forwarded to ``plt.scatter``.
        x_label: Text for the x-axis label.
        y_label: Text for the y-axis label.
        title: Plot title.
        color: Marker colour, forwarded to ``plt.scatter`` as ``c``.
    """
    # Plot the arguments themselves rather than module-level ``x``/``y``, so
    # the function draws whatever the caller passes in.
    plt.scatter(x_data, y_data, s=area, alpha=alpha, c=color)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    # NOTE(review): the scatter call above sets no ``label``, so this legend
    # presumably has no entries of its own -- confirm whether it is needed.
    plt.legend(loc='upper left')
    plt.show()

# 数据清洗,标签准备

# Gap between the two (masked) date columns, expressed in days.
lhs_dates = pd.to_datetime(df1['×××'])
rhs_dates = pd.to_datetime(df1['×××'])
seconds_per_day = 24 * 60 * 60
df1['date'] = (lhs_dates - rhs_dates).dt.total_seconds() / seconds_per_day
# Visualise the gap by plotting the series against itself.
x = df1['date']
y = x
area = np.pi * 3
scatterplot(x, y, area, 0.7, x_label="date", y_label="y", title="pay time img")

png

# Repayment gaps with the NaN entries removed.
date_show = df1['date'].dropna()

# Plain matplotlib histogram, 155 bins.
plt.hist(date_show, bins=155, facecolor='blue', edgecolor='black')

# Second histogram through seaborn, 500 bins; kde=False, so only bars are
# requested and no kernel-density curve.
bar_style = {'edgecolor': 'black'}
sns.distplot(date_show, bins=500, hist=True, kde=False, color='blue', hist_kws=bar_style)
plt.title('Histogram of pay date')
plt.xlabel('pay date')
plt.ylabel('people count')
plt.show()
/home/c/anaconda2/lib/python2.7/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.
  warnings.warn("The 'normed' kwarg is deprecated, and has been "

在这里插入图片描述

print('The shape of our features is:', df1.shape)

# Label preparation: y = 1 when there is no actual repayment date or the
# 'date' gap is greater than 7, otherwise 0. The condition is evaluated once
# and reused for both the label and the summary line.
overdue_mask = pd.isnull(df1['act_repay_dt']) | (df1['date'] > 7)
df1['y'] = np.where(overdue_mask, 1, 0)

illegal = df1[overdue_mask]
overdue_count = len(illegal)
overdue_ratio = float(overdue_count) / float(len(df1))
print("至今未还款或者还款时间逾期的人有 %d 人,占比 %.3f" % (overdue_count, overdue_ratio))
columns = ['act_repay_dt', 'plan_repay_dt', 'date']
# Drop the interfering columns (first pass): the two raw date columns and the
# gap derived from them.
df1.drop(columns, inplace=True, axis=1)

# Crude outlier trimming: for every remaining column, remove the 3 rows that
# hold its largest values. (TODO: this method needs improving.)
# NOTE(review): the column list also contains the label 'y', so the loop
# removes three rows selected by the label as well -- confirm this is intended.
columns = df1.columns.tolist()
for col in columns:
    indexs = df1.nlargest(3, columns=[col]).index.values
    # One drop call per column instead of one call per row.
    df1.drop(indexs, inplace=True)

print('The shape of our features after del is:', df1.shape)
# TODO: compute correlations and remove features with a very high coefficient.

('The shape of our features is:', (12154, 221))
至今未还款或者还款时间逾期的人有 1837 人,占比 0.151
('The shape of our features after del is:', (11497, 219))
df1.head(3)

# 均值填充空值

df1 = df1.fillna(df1.mean())
# Every column except the last is a feature; the last column is the label.
feature_frame = df1.iloc[:, :-1]
label_column = df1.iloc[:, -1]
x = np.array(feature_frame)
y = np.array(label_column)
# 70 % / 30 % train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=11)

# dt = DictVectorizer(sparse=False)
# x_train = dt.fit_transform(x_train.to_dict())
# x_test = dt.fit_transform(x_test.to_dict())

for caption, split in (
        ('Training Features Shape:', x_train),
        ('Training Labels Shape:', y_train),
        ('Testing Features Shape:', x_test),
        ('Testing Labels Shape:', y_test)):
    print(caption, split.shape)
('Training Features Shape:', (8047, 218))
('Training Labels Shape:', (8047,))
('Testing Features Shape:', (3450, 218))
('Testing Labels Shape:', (3450,))
# # Decision-tree variant (disabled, kept for reference)
# dtc = DecisionTreeClassifier()

# dtc.fit(x_train, y_train)

# dt_predict = dtc.predict(x_test)

# print(dtc.score(x_test, y_test))
# print(classification_report(y_test, dt_predict, target_names=["died", "survived"]))

# Random-forest variant, constructed with default arguments.
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
rfc_y_predict = rfc.predict(x_test)
# score() returns the mean accuracy on the given test data and labels.
mean_accuracy = rfc.score(x_test, y_test)
print("均值填充平均精度为:{:.2f}".format(mean_accuracy))
均值填充平均精度为:0.86
print("The accuracy/recall rate and other results are as follows:")
# Display names for label 0 and label 1, in that order.
class_names = ["plan_repay", "overdue_repay"]
report = classification_report(y_test, rfc_y_predict, target_names=class_names)
print(report)
The accuracy/recall rate and other results are as follows:
               precision    recall  f1-score   support

   plan_repay       0.87      0.99      0.92      2976
overdue_repay       0.33      0.04      0.07       474

  avg / total       0.79      0.86      0.81      3450
rfc_y_predict
array([0, 0, 0, ..., 0, 0, 0])
y_test
array([0, 0, 0, ..., 0, 0, 0])
# 特征重要性
rfc.feature_importances_

调优

max_features、n_estimators、min_samples_leaf

可参考,CSDN–BYR_jiandong:随机森林的几个重要参数

设置交叉验证

# Hyper-parameter grid for the search. Every dict entry is written key: value
# ('max_depth' = [...] inside a dict literal does not parse).
cv_parameter = [{'min_samples_leaf': [5, 15, 25, 35], 'n_estimators': [50, 200, 500], 'max_depth': [2, 3, 5]}]
n_jobs:并行运行的任务数(n_jobs=1 表示不并行,n_jobs=-1 表示使用全部 CPU 核心)
# Grid search over cv_parameter for the estimator rfc, using cv=5 and n_jobs=1.
clf = GridSearchCV(estimator=rfc,param_grid=cv_parameter, cv=5, n_jobs=1)

max_depth :

整数或None,可选(默认=None)
树的最大深度。如果为None,则扩展节点直到所有叶子都是纯的或直到所有叶子包含少于min_samples_split样本。

from sklearn.model_selection import GridSearchCV

# Base forest for the search, with a fixed seed.
rfc = RandomForestClassifier(random_state=3, max_features='sqrt')
# Candidate values for each tuned hyper-parameter.
search_space = {
    'n_estimators': [50, 200, 500],
    'min_samples_leaf': [5, 15, 25, 35],
    'max_depth': [2, 3, 5],
}
cv_parameter = [search_space]
clf = GridSearchCV(estimator=rfc, param_grid=cv_parameter, cv=5, n_jobs=1)

clf.fit(x_train, y_train)
print('Best parameters:')
print(clf.best_params_)

在这里插入图片描述

设置权重

# Give label 1 a class weight of 5 against 1 for label 0 (presumably to offset
# the much smaller number of overdue samples -- confirm).
rfc = RandomForestClassifier(random_state = 3, class_weight={0: 1, 1: 5})

关于结果classification_report

在这里插入图片描述
预测出25个正样本,其中11个预测正确,而真实正样本共474个。精确率(precision)为 11/25 = 0.44,召回率(recall)为 11/474 ≈ 0.023
在这里插入图片描述

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值