信用智能评分1--初步实现

最新推荐文章于 2024-07-02 19:40:44 发布

笔尖微凉

最新推荐文章于 2024-07-02 19:40:44 发布

阅读量483

点赞数

分类专栏：机器学习文章标签：信用数据挖掘随机森林支持向量机岭回归

本文链接：https://blog.csdn.net/weixin_42263508/article/details/90635031

版权

机器学习专栏收录该内容

20 篇文章 0 订阅

订阅专栏

信用智能评分
大赛地址：https://www.datafountain.cn/competitions/337/datasets

程序：

#coding:utf-8
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

#导入数据
data = pd.read_csv("train_dataset.csv",header = 0,error_bad_lines=False,encoding="gbk")
# data = pd.read_csv("train_dataset.csv",header = 0,error_bad_lines=False)
# test = pd.read_csv("test_dataset.csv",header = 0,error_bad_lines=False)
print(data.head())#查看data信息
# print(data.describe())
"""
#测试有误缺失值，经测试出无缺失值，所以可以不运行
# total = test.isnull().sum().sort_values(ascending=False)
# print(total)
"""

# print(data.corr()['当月网购类应用使用次数'])#查看各特征间的相关性
# print(data.corr().describe() )#查看相关性的信息
# print("\n统计某一列中各个元素值出现的次数"
# print(data.value_counts())
# print("\n列出数据的偏斜度")
# print(data.skew())
#
# print("\n列出数据的峰度")
# print(data.kurt())

#特征工程







# 从数据中，把相关度比较高的特征以及target踢掉
dataRel = data.drop(['用户近6个月平均消费值（元）',"用户编码"], axis=1)
x_feature = list(dataRel.columns)
x_feature.remove('信用分')
x_val = dataRel[x_feature]
y_val = dataRel['信用分']

#1. 特征向量化
    #将连续值的属性放入一个dict中---无连续值
    # 把离散值的属性放到一个dict中
from sklearn.feature_extraction import DictVectorizer
X_dictCon = x_val.T.to_dict().values()
# y_dictCon = y_val.T.to_dict().values()
# print(X_dictCon)
# print(y_dictCon)

# 2.向量化特征,即将上述的编成的特征与数字一一对应，进行组成数组
    #标准化连续值特征--连续值--无连续值
    #类别特征编码--离散值
vec = DictVectorizer(sparse = False)
X_vec_con = vec.fit_transform(X_dictCon)
# y_vec_con = vec.fit_transform(y_dictCon)
# print("向量化特征:",X_vec_con)


#类别特征编码
#最常用的当然是one-hot编码咯，比如颜色 红、蓝、黄 会被编码为[1, 0, 0]，[0, 1, 0]，[0, 0, 1]
# from sklearn import preprocessing
# #one-hot编码
# enc = preprocessing.OneHotEncoder()
# enc.fit(X_vec_con)
# X_vec_con = enc.transform(X_vec_con).toarray()
# print("类别特征编码：",X_vec_con)

X_vec = X_vec_con
Y = data['信用分'].values





# 1.岭回归
from sklearn.model_selection import train_test_split
# from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
print("\n1.岭回归")
x_train,x_test,y_train,y_test = train_test_split(X_vec,Y,test_size=0.25,random_state=1)
clf = LinearRegression()
clf.fit(x_train, y_train)
t=clf.predict(x_test)
print(t)
print("y_test:%s"%y_test)
print("训练集准确率:{},测试集准确率:{}".format(
        clf.score(x_train, y_train),
        clf.score(x_test, y_test)
))

# 2.支持向量机--为什么分类的SVM可以用来做回归
from sklearn.svm import SVR
print("\n2.支持向量机")
clf = SVR(kernel='rbf', C=10, gamma=0.001)
clf.fit(x_train, y_train)
t=clf.predict(x_test)
print(t)
print("y_test:%s"%y_test)
print("训练集准确率:{},测试集准确率:{}".format(
        clf.score(x_train, y_train),
        clf.score(x_test, y_test)
))

# 3.随机森林
print("\n3.随机森林")
# from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(n_estimators=80)
clf.fit(x_train, y_train)
t=clf.predict(x_test)
print(t)
print("y_test:%s"%y_test)
print("训练集准确率:{},测试集准确率:{}".format(
        clf.score(x_train, y_train),
        clf.score(x_test, y_test)
))

# 4.用网格搜索给随机森林找一组参数，然后在用随机森林预测
print("\n4.用网格搜索给随机森林找一组参数，然后在用随机森林预测")
from sklearn.model_selection import GridSearchCV
params = {"n_estimators":[30,60,90]}
scores = ['r2']
for score in scores:
    print(score)
    clf = GridSearchCV(RandomForestRegressor(),params,cv=5,scoring=score)
    clf.fit(x_train,y_train)
    print(clf.best_estimator_)

clf = RandomForestRegressor(bootstrap=True,criterion='mse',max_depth=None,
                            max_features = "auto",max_leaf_nodes = None,
                            min_impurity_decrease=0.0,min_impurity_split=None,
                            min_samples_leaf=1,min_samples_split=2,
                            min_weight_fraction_leaf =0.0,n_estimators = 90,n_jobs = 1,
                            oob_score=False,random_state = None,verbose=0,warm_start=False
                            )
clf.fit(x_train, y_train)
t=clf.predict(x_test)
print(t)
print("y_test:%s"%y_test)
print("训练集准确率:{},测试集准确率:{}".format(
        clf.score(x_train, y_train),
        clf.score(x_test, y_test)
))

结果：

"D:\Program Files\Anaconda3\python.exe" "F:/data/Group Image of Consumers-----Intelligent Scoring of Credits/Intelligent Scoring of Credits.py"
                               用户编码  用户实名制是否通过核实  用户年龄  是否大学生客户  是否黑名单客户  \
0  a4651f98c82948b186bdcdc8108381b4            1    44        0        0   
1  aeb10247db4e4d67b2550bbc42ff9827            1    18        0        0   
2  5af23a1e0e77410abb25e9a7eee510aa            1    47        0        0   
3  43c64379d3c24a15b8478851b22049e4            1    55        0        0   
4  f1687f3b8a6f4910bd0b13eb634056e2            1    40        0        0   

   是否4G不健康客户  用户网龄（月）  用户最近一次缴费距今时长（月）  缴费用户最近一次缴费金额（元）  用户近6个月平均消费值（元） ...   \
0          0      186                1            99.80          163.86 ...    
1          1        5                1            29.94          153.28 ...    
2          0      145                1            49.90          109.64 ...    
3          0      234                1            99.80           92.97 ...    
4          0       76                1            49.90           95.47 ...    

   当月是否景点游览  当月是否体育场馆消费  当月网购类应用使用次数  当月物流快递类应用使用次数  当月金融理财类应用使用总次数  \
0         1           1          713              0            2740   
1         0           0          414              0            2731   
2         0           0         3391              0               0   
3         1           1          500              0            1931   
4         1           0          522              0              64   

   当月视频播放类应用使用次数  当月飞机类应用使用次数  当月火车类应用使用次数  当月旅游资讯类应用使用次数  信用分  
0           7145            0            0             30  664  
1          44862            0            0              0  530  
2           4804            0            0              1  643  
3           3141            0            0              5  649  
4             59            0            0              0  648  

[5 rows x 30 columns]


1.岭回归
[ 603.24494938  610.69884317  671.30386921 ...,  613.49420061  628.40605349
  581.6950078 ]
y_test:[645 586 684 ..., 643 644 610]
训练集准确率:0.5526331407565183,测试集准确率:0.5605114966508751

2.支持向量机
[ 631.42026184  615.31989263  631.4418242  ...,  631.43086289  631.43086289
  631.42752693]
y_test:[645 586 684 ..., 643 644 610]
训练集准确率:0.5144571757119787,测试集准确率:0.28462963911419925

3.随机森林
[ 632.6625  628.8     668.2625 ...,  644.7625  642.0875  592.625 ]
y_test:[645 586 684 ..., 643 644 610]
训练集准确率:0.9643187750448226,测试集准确率:0.7543190000574976

4.用网格搜索给随机森林找一组参数，然后在用随机森林预测
r2
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=90, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
[ 630.1         627.04444444  665.32222222 ...,  645.76666667  638.43333333
  590.68888889]
y_test:[645 586 684 ..., 643 644 610]
训练集准确率:0.9643815344347733,测试集准确率:0.7547775698436044

Process finished with exit code 0