信用智能评分
大赛地址:https://www.datafountain.cn/competitions/337/datasets
程序:
#coding:utf-8
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Load the training data (GBK-encoded CSV with a header row).
# NOTE: `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
# try the modern `on_bad_lines="skip"` keyword first and fall back to the
# legacy keyword so the script runs under either pandas generation.
try:
    data = pd.read_csv("train_dataset.csv", header=0, on_bad_lines="skip", encoding="gbk")
except TypeError:  # older pandas without `on_bad_lines`
    data = pd.read_csv("train_dataset.csv", header=0, error_bad_lines=False, encoding="gbk")
# data = pd.read_csv("train_dataset.csv",header = 0,error_bad_lines=False)
# test = pd.read_csv("test_dataset.csv",header = 0,error_bad_lines=False)
print(data.head())  # quick sanity check of the loaded frame
# print(data.describe())
"""
#测试有无缺失值,经测试出无缺失值,所以可以不运行
# total = test.isnull().sum().sort_values(ascending=False)
# print(total)
"""
# print(data.corr()['当月网购类应用使用次数'])#查看各特征间的相关性
# print(data.corr().describe() )#查看相关性的信息
# print("\n统计某一列中各个元素值出现的次数")
# print(data.value_counts())
# print("\n列出数据的偏斜度")
# print(data.skew())
#
# print("\n列出数据的峰度")
# print(data.kurt())
# Feature engineering:
# drop the column highly correlated with the target plus the user-ID column.
dataRel = data.drop(['用户近6个月平均消费值(元)', "用户编码"], axis=1)
x_feature = list(dataRel.columns)
x_feature.remove('信用分')  # the target must not appear among the features
x_val = dataRel[x_feature]
y_val = dataRel['信用分']
# 1. Feature vectorization: represent each sample as a feature dict
# (this dataset has no separate continuous/discrete split to handle).
from sklearn.feature_extraction import DictVectorizer
# PERF FIX: to_dict("records") yields one dict per row directly; the original
# x_val.T.to_dict().values() transposed the whole frame first, which upcasts
# mixed dtypes to object and is far slower on large frames.
X_dictCon = x_val.to_dict("records")
# print(X_dictCon)
# 2. Vectorize the per-sample dicts into a dense numeric design matrix.
# DictVectorizer one-hot encodes string-valued entries automatically and
# passes numeric entries through unchanged, so no extra category-encoding
# step (e.g. a separate OneHotEncoder) is needed here.
vec = DictVectorizer(sparse=False)
X_vec = vec.fit_transform(X_dictCon)
# print("向量化特征:",X_vec)
# Regression target: the raw credit-score column as a numpy array.
Y = data['信用分'].values
# 1. Ridge regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
print("\n1.岭回归")
# Hold out 25% of the samples for evaluation; fixed seed for reproducibility.
x_train,x_test,y_train,y_test = train_test_split(X_vec,Y,test_size=0.25,random_state=1)
# BUG FIX: this section is labelled ridge regression (岭回归) and Ridge is
# imported above, but the original fitted a plain LinearRegression.
# Use Ridge so the model matches the section's stated intent.
clf = Ridge()
clf.fit(x_train, y_train)
t = clf.predict(x_test)
print(t)
print("y_test:%s"%y_test)
# NOTE: .score() on regressors is the R^2 coefficient of determination,
# not classification accuracy, despite the printed label.
print("训练集准确率:{},测试集准确率:{}".format(
    clf.score(x_train, y_train),
    clf.score(x_test, y_test)
))
# 2. Support vector regression.
# SVR is the support-vector machine formulated for regression — a distinct
# estimator, not a reuse of the SVM classifier.
from sklearn.svm import SVR
print("\n2.支持向量机")
clf = SVR(kernel='rbf', C=10, gamma=0.001)
clf.fit(x_train, y_train)
svr_pred = clf.predict(x_test)
print(svr_pred)
print("y_test:%s" % y_test)
# .score() reports R^2 on each split.
train_r2 = clf.score(x_train, y_train)
test_r2 = clf.score(x_test, y_test)
print("训练集准确率:{},测试集准确率:{}".format(train_r2, test_r2))
# 3. Random forest regression with a fixed forest size of 80 trees.
print("\n3.随机森林")
from sklearn.ensemble import RandomForestRegressor
clf = RandomForestRegressor(n_estimators=80)
clf.fit(x_train, y_train)
forest_pred = clf.predict(x_test)
print(forest_pred)
print("y_test:%s" % y_test)
# .score() reports R^2 on each split.
train_r2 = clf.score(x_train, y_train)
test_r2 = clf.score(x_test, y_test)
print("训练集准确率:{},测试集准确率:{}".format(train_r2, test_r2))
# 4. Grid-search the forest size, then predict with the best model found.
print("\n4.用网格搜索给随机森林找一组参数,然后在用随机森林预测")
from sklearn.model_selection import GridSearchCV
params = {"n_estimators": [30, 60, 90]}
scores = ['r2']
for score in scores:
    print(score)
    search = GridSearchCV(RandomForestRegressor(), params, cv=5, scoring=score)
    search.fit(x_train, y_train)
    print(search.best_estimator_)
# BUG FIX: the original hand-copied the winning parameters into a fresh
# RandomForestRegressor using keyword arguments that have since been removed
# from scikit-learn (criterion='mse', min_impurity_split=None,
# max_features='auto'), which raises on modern releases.  Reuse the
# grid-search's best estimator instead: with the default refit=True it is
# already retrained on the full training split and carries the found params.
clf = search.best_estimator_
t = clf.predict(x_test)
print(t)
print("y_test:%s"%y_test)
# .score() reports R^2 on each split, not classification accuracy.
print("训练集准确率:{},测试集准确率:{}".format(
    clf.score(x_train, y_train),
    clf.score(x_test, y_test)
))
结果:
"D:\Program Files\Anaconda3\python.exe" "F:/data/Group Image of Consumers-----Intelligent Scoring of Credits/Intelligent Scoring of Credits.py"
用户编码 用户实名制是否通过核实 用户年龄 是否大学生客户 是否黑名单客户 \
0 a4651f98c82948b186bdcdc8108381b4 1 44 0 0
1 aeb10247db4e4d67b2550bbc42ff9827 1 18 0 0
2 5af23a1e0e77410abb25e9a7eee510aa 1 47 0 0
3 43c64379d3c24a15b8478851b22049e4 1 55 0 0
4 f1687f3b8a6f4910bd0b13eb634056e2 1 40 0 0
是否4G不健康客户 用户网龄(月) 用户最近一次缴费距今时长(月) 缴费用户最近一次缴费金额(元) 用户近6个月平均消费值(元) ... \
0 0 186 1 99.80 163.86 ...
1 1 5 1 29.94 153.28 ...
2 0 145 1 49.90 109.64 ...
3 0 234 1 99.80 92.97 ...
4 0 76 1 49.90 95.47 ...
当月是否景点游览 当月是否体育场馆消费 当月网购类应用使用次数 当月物流快递类应用使用次数 当月金融理财类应用使用总次数 \
0 1 1 713 0 2740
1 0 0 414 0 2731
2 0 0 3391 0 0
3 1 1 500 0 1931
4 1 0 522 0 64
当月视频播放类应用使用次数 当月飞机类应用使用次数 当月火车类应用使用次数 当月旅游资讯类应用使用次数 信用分
0 7145 0 0 30 664
1 44862 0 0 0 530
2 4804 0 0 1 643
3 3141 0 0 5 649
4 59 0 0 0 648
[5 rows x 30 columns]
1.岭回归
[ 603.24494938 610.69884317 671.30386921 ..., 613.49420061 628.40605349
581.6950078 ]
y_test:[645 586 684 ..., 643 644 610]
训练集准确率:0.5526331407565183,测试集准确率:0.5605114966508751
2.支持向量机
[ 631.42026184 615.31989263 631.4418242 ..., 631.43086289 631.43086289
631.42752693]
y_test:[645 586 684 ..., 643 644 610]
训练集准确率:0.5144571757119787,测试集准确率:0.28462963911419925
3.随机森林
[ 632.6625 628.8 668.2625 ..., 644.7625 642.0875 592.625 ]
y_test:[645 586 684 ..., 643 644 610]
训练集准确率:0.9643187750448226,测试集准确率:0.7543190000574976
4.用网格搜索给随机森林找一组参数,然后在用随机森林预测
r2
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=90, n_jobs=1,
oob_score=False, random_state=None, verbose=0, warm_start=False)
[ 630.1 627.04444444 665.32222222 ..., 645.76666667 638.43333333
590.68888889]
y_test:[645 586 684 ..., 643 644 610]
训练集准确率:0.9643815344347733,测试集准确率:0.7547775698436044
Process finished with exit code 0