# Problem description
# Import the data and inspect the dataset
# Dataset location
import pandas as pd
import numpy as np
# Load the raw credit-card dataset from the current working directory.
data = pd.read_csv("credit_card.csv")
# info() prints dtypes and non-null counts; the describe() result is not
# stored, so it is only visible in an interactive session.
data.info()
data.describe()
# Build the historical-behaviour features
# Positional selection of the six adverse-record columns that are later read
# from data_active by name (assumes the CSV column order is fixed).
data_active = data.iloc[:, [2, 3, 4, 6, 7, 8]]
# data_means starts as the first column of the dataset and has the derived risk
# scores added to it later.  Take an explicit copy so those assignments write
# to an independent frame instead of raising SettingWithCopyWarning.
data_means = data.iloc[:, [0]].copy()
# The describe() result is not stored; info() prints a summary.
data_active.describe()
data_active.info()
def GetScore(x):
    """Return 0 when *x* is at least 2, otherwise 1.

    Used to turn an adverse-record column value into a 0/1 indicator.
    Values that do not compare as ``>= 2`` (including NaN) yield 1.
    """
    return 0 if x >= 2 else 1
# One 0/1 indicator per adverse-record column, produced by GetScore.
score_1 = data_active['瑕疵户'].apply(GetScore)        # blemished account
score_2 = data_active['逾期'].apply(GetScore)          # overdue
score_3 = data_active['呆账'].apply(GetScore)          # bad debt
score_4 = data_active['退票'].apply(GetScore)          # bounced cheque
score_5 = data_active['拒往记录'].apply(GetScore)      # refused-dealing record
score_6 = data_active['强制停卡记录'].apply(GetScore)  # forced card suspension
# Weighted sum of the indicators: weights are 1, 2, 3, 3, 3, 1 in column order.
data_means.loc[:, 'history_credit_risk'] = (
    score_1
    + score_2 * 2
    + score_3 * 3
    + score_4 * 3
    + score_5 * 3
    + score_6
)
# Build the economic-risk features
# Positional selection of the income / spending columns that are later read
# from data_encomic by name (assumes the CSV column order is fixed).
data_encomic = data.iloc[:, [5, 18, 19, 21, 22]]
# The describe() result is not stored; info() prints a summary.
data_encomic.describe()
data_encomic.info()
def GetScore_encomic(x):
    """Return 1 when *x* is at least 2, otherwise 0.

    This is the mirror image of ``GetScore``.  Values that do not compare
    as ``>= 2`` (including NaN) yield 0.
    """
    return 1 if x >= 2 else 0
# Monthly card-spend component: GetScore_encomic maps values >= 2 to 1, else 0.
score_yu = data_encomic['月刷卡额'].apply(GetScore_encomic)

# Personal monthly income divided by personal monthly expenses.  A ratio
# below 1 scores 0; anything else scores 1.  "not (< 1)" is used rather than
# ">= 1" so NaN ratios keep scoring 1, exactly as the element-wise loop did.
data_person = data_encomic['个人月收入'] / data_encomic['个人月开销']
data_person_Scores = (~(data_person < 1)).astype(int).tolist()

# Household monthly income divided by monthly card spend, scored the same way.
# The previous loop tested data_person here, which left data_mouth unused and
# made this score a duplicate of the personal one; it now tests data_mouth.
data_mouth = data_encomic['家庭月收入'] / data_encomic['月刷卡额']
data_mouth_Scores = (~(data_mouth < 1)).astype(int).tolist()

# Economic-risk score is the element-wise sum of the three components.
data_means['economic_risk'] = (
    np.array(data_mouth_Scores)
    + np.array(data_person_Scores)
    + np.array(score_yu)
)
# Build the income-risk features
# Positional selection of the housing, occupation and age columns that are
# later read from data_shouru by name (assumes the CSV column order is fixed).
data_shouru = data.iloc[:, [14, 17, 20]]
# The describe() result is not stored; info() prints a summary.
data_shouru.describe()
data_shouru.info()
# Housing component: codes 3..5 (inclusive) score 0, every other value
# (including NaN) scores 1.  Computed column-wise so the result has exactly
# one entry per row and does not depend on the frame having a 0..n-1 index.
HouseScore = np.where(data_shouru['住家'].between(3, 5), 0, 1).tolist()
# Occupation component, by occupation code:
#   <= 7, 19, 21        -> 2
#   8 .. 11             -> 1
#   12 .. 18, 20, 22    -> 0
# The previous three independent `if` statements appended nothing for a code
# outside these ranges (or NaN), leaving JobScore shorter than the data and
# breaking the later element-wise sum.  np.select yields exactly one score per
# row; unlisted codes fall through to the default of 0.
# NOTE(review): 0 for unrecognised codes is a choice made here - confirm.
_job_code = data_shouru['职业']
JobScore = np.select(
    [
        (_job_code <= 7) | _job_code.isin([19, 21]),
        _job_code.between(8, 11),
    ],
    [2, 1],
    default=0,
).tolist()
# Age component: age codes <= 2 score 1, every other value (including NaN)
# scores 0.  Computed column-wise so the result has exactly one entry per row
# and does not depend on the frame having a 0..n-1 index.
AgeScore = (data_shouru['年龄'] <= 2).astype(int).tolist()
# Income-risk score is the element-wise sum of the housing, occupation and
# age components.
data_means['income_risk'] = (
    np.array(HouseScore)
    + np.array(JobScore)
    + np.array(AgeScore)
)
# Cluster analysis
from sklearn. preprocessing import StandardScaler
# Standardise columns 1-3 of data_means before clustering.
# Presumably these are the three risk scores appended after the first column -
# confirm that no other column has been inserted in between.
sc_X = StandardScaler()
data_means_stander = sc_X.fit_transform(
    data_means.iloc[:, [1, 2, 3]]
)
from sklearn. cluster import KMeans
# Number of clusters to form.
k = 5
# `n_jobs` was removed from KMeans in scikit-learn 1.0 and now raises a
# TypeError, so it is no longer passed.  n_init=10 pins the historical default
# (newer releases default to 'auto'), so the fitted clusters do not change
# with the installed version.
kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=123)
fit_kmeans = kmeans_model.fit(data_means_stander)

# Overall score: plain sum of the three risk features.
data_means['count'] = (
    data_means['income_risk']
    + data_means['economic_risk']
    + data_means.loc[:, 'history_credit_risk']
)
# Sorted copy, taken before the cluster label column is added below.
sort_values = data_means.sort_values("count", inplace=False)

# Bare expression: only displays the centres in an interactive session.
kmeans_model.cluster_centers_
# The column name 'lable' (sic) is written to the CSV, so its spelling is left
# unchanged for anything that reads mean.csv.
data_means['lable'] = kmeans_model.labels_
# Number of rows assigned to each cluster.
r1 = pd.Series(kmeans_model.labels_).value_counts()
data_means.to_csv("mean.csv", index=False, sep=',', encoding="utf_8_sig")