K-means Clustering of Bank Customers with Python

The main goals of this project are:

1. Practice feature engineering.

2. Implement a simple K-means in Python with randomly chosen initial centroids, and compare it with sklearn's KMeans, which uses k-means++ initialization.

As usual, the data and code come first.

Download: click here

Problems in this dataset:

1. Duplicate data: customer-level records are duplicated, in two ways (a minimal pandas sketch follows this item):

(1) Multiple loan products: loan products belonging to the same customer are merged.

(2) Repeated submissions of the same application (information updates): only the most recent submission is kept.
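A minimal sketch of this de-duplication idea on a toy frame (hypothetical data; the column names match those used in the script below, whose real pipeline is more involved):

import pandas as pd

# toy data: customer 'A' submitted the same application twice
toy = pd.DataFrame({'CUST_ID':  ['A', 'A', 'B'],
                    'Loan_Amt': [100000, 100000, 50000],
                    'Loan_Term':[120, 120, 60]})

toy = toy.drop_duplicates()                          # drop exact duplicate submissions
toy = toy.groupby('CUST_ID', as_index=False).last()  # keep the most recent remaining record per customer
print(toy)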

2. Missing data

(1) Missing values are replaced with a substitute value (a minimal sketch follows this item).

(2) The following fields are missing in too high a proportion of records and are excluded from modeling:

    • Education level
    • Monthly income
    • Occupation
    • Industry
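For point (1), the script below fills each missing value with a sentinel category via a small helper (makeupMissing). A roughly equivalent pandas version, shown here only as an illustration, would be:

# assuming id_loans is the de-duplicated loan DataFrame built later in the script
id_loans['Interest_Payment'] = id_loans['Interest_Payment'].fillna(9).astype(int)   # 9 = "missing" category
id_loans['Credit_Level'] = id_loans['Credit_Level'].fillna(0).astype(int)           # 0 = "missing" category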

3. Normalization in preprocessing

    The features differ greatly in scale; in cluster analysis they must be normalized so that scale does not distort the distance calculation (see the sketch after this list):

    • Loan term: 60, 120, 240, etc.

    • Loan contract amount: 10,000 ~ 7,000,000, etc.

    • Loan interest rate: 3.43, 4.165, 5.39, etc.
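As a quick illustration (not part of the project code), min-max scaling maps each feature into [0, 1], so a 7,000,000 loan amount and a 5.39 interest rate contribute on a comparable scale to the distance:

import pandas as pd

toy = pd.DataFrame({'Loan_Term': [60, 120, 240], 'Interest_Rate': [3.43, 4.165, 5.39]})
for col in ['Loan_Term', 'Interest_Rate']:
    toy[col + '_norm'] = (toy[col] - toy[col].min()) / (toy[col].max() - toy[col].min())
print(toy)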

4. Encoding in preprocessing

There are (non-numeric) categorical fields:

Business type: personal first-hand housing loan, personal second-hand housing loan, ...

Marital status: married, unmarried, marital status not stated

Urban/rural flag: urban, rural

Gender: male, female

These fields are handled with one-hot encoding (see the sketch below).
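The script below rolls its own oneHotEncoding helper because, after grouping, each customer holds a list of category values (one per loan). For an ordinary single-valued column, pandas' built-in get_dummies does the same job; a minimal sketch on hypothetical data:

import pandas as pd

toy = pd.DataFrame({'Gender': ['M', 'F', 'M'], 'Rural': ['urban', 'rural', 'urban']})
encoded = pd.get_dummies(toy, columns=['Gender', 'Rural'])   # one 0/1 column per category value
print(encoded)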

5. Feature derivation (a groupby sketch follows this list)

  • Number of loan products

  • Number of distinct loan product types

  • Maximum, minimum, and average loan term

  • Total and average loan amount

  • Total and average loan balance

  • Balance-to-amount ratio = total / average loan balance ÷ total / average loan amount

  • Minimum and maximum interest rate
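Most of these features are per-customer aggregations. The script below computes them by first packing each field into a list per customer; an alternative sketch using groupby/agg directly on a de-duplicated loan table (loans_dedup is a placeholder name for the id_loans frame built later in the script) could look like:

# loans_dedup: placeholder for the de-duplicated loan table (id_loans in the script below)
derived = loans_dedup.groupby('CUST_ID').agg(
    Num_Loans=('Loan_Type', 'size'),
    Num_Loan_Types=('Loan_Type', 'nunique'),
    Max_Loan_Term=('Loan_Term', 'max'),
    Min_Loan_Term=('Loan_Term', 'min'),
    Total_Loan_Amt=('Loan_Amt', 'sum'),
    Mean_Loan_Amt=('Loan_Amt', 'mean'),
    Min_Interest_Rate=('Interest_Rate', 'min'),
    Max_Interest_Rate=('Interest_Rate', 'max'),
)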

The code is commented in detail; if anything is unclear, feel free to discuss in the comments.

#%%
import pandas as pd
import numpy as np
import random
from sklearn import cluster
import matplotlib.pyplot as plt
from functools import reduce
import scipy.cluster.hierarchy as sch
# def oneHotEncoding(df, old_col):
#     new_col_list = df[old_col].unique()
#     for i in new_col_list:
#         i_values = []
#         for j in range(df.shape[0]):
#             if df[old_col][j] == i:
#                 i_value = (j,1)
#                 i_values.append(i_value)
#             else:
#                 i_value = (j,0)
#                 i_values.append(i_value)
#         df[old_col+'_'+str(i)] = [x[1] for x in i_values]
#     df.drop([old_col],1, inplace=True)

def oneHotEncoding(df, old_field):
    # each cell of df[old_field] is a list (one value per loan); collect all distinct values
    distinct_vals = list(set(reduce(lambda x,y: x+y, df[old_field])))
    cnt = len(distinct_vals)
    new_fields = [old_field + '_' + str(i) for i in range(cnt)]
    for i in range(cnt):
        # 0/1 indicator: does this customer's list contain the i-th distinct value?
        df[new_fields[i]] = df[old_field].map(lambda x: int(distinct_vals[i] in x))
    del df[old_field]
    return 1

def normalization(df, var, method='min-max'):
    # scale column `var` in place and store it as `var + '_norm'`
    x = df[var]
    new_field = var + '_norm'
    if method == 'min-max':
        # rescale to [0, 1]
        x_min = min(x)
        x_max = max(x)
        d = x_max - x_min
        df[new_field] = [(i-x_min)/d for i in x]
        del df[var]
        return 1
    elif method == 'zero-score':
        # standardize to zero mean and unit variance (z-score)
        mu = np.mean(x)
        std = np.std(x)
        df[new_field] = [(i - mu)/std for i in x]
        del df[var]
        return 1
    else:
        print('Please specify the normalization method: min-max or zero-score')
        return -1

def makeupMissing(x, replaceVal):
    # replace a missing (NaN) value with replaceVal
    if np.isnan(x):
        return replaceVal
    else:
        return x

def minkovDist(x, y, p=2):
    # Minkowski distance; p = 2 gives the Euclidean distance
    if p >= 1:
        return sum(abs(x - y)**p)**(1/p)
    else:
        print('p must be larger than or equal to 1')
        return -1
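# Quick sanity check (added for illustration, not in the original post): for p = 2 the
# result should match scipy's reference implementation of the Minkowski distance.
from scipy.spatial.distance import minkowski
print(minkovDist(np.array([3.0, 4.0]), np.array([0.0, 0.0])))        # 5.0
print(minkowski(np.array([3.0, 4.0]), np.array([0.0, 0.0]), p=2))    # 5.0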

def KmeansAlgo(dataset, k):
    N = dataset.shape[0]
    label = [0]*N
    centroidsIndex = random.sample(range(N), k) # randomly pick k of the N points as initial centroids
    centroids = [dataset[i,] for i in centroidsIndex] # look up the centroid rows by index
    centroidsChanged = True # loop flag
    while(centroidsChanged):
        centroidsChanged = False
        for i in range(N):
            # compute the distance from this point to every centroid and assign it to the nearest one
            dist_to_cent = [minkovDist(dataset[i,].getA()[0], centroid.getA()[0]) for centroid in centroids]
            label[i] = dist_to_cent.index(min(dist_to_cent))
        # update the centroid of each cluster
        for j in range(k):
            position = [p for p in range(N) if label[p] == j]
            clusterGroup = dataset[position]
            newCents = np.mean(clusterGroup,axis=0)
            # because of floating-point error, treat centroids within 1e-5 of each other as the same point
            if minkovDist(newCents.getA()[0], centroids[j].getA()[0]) > 0.00001:
                # any centroid that moves triggers another iteration; do not reset the flag here,
                # otherwise a later unchanged centroid would hide an earlier change
                centroidsChanged = True
                centroids[j] = newCents
    cost = 0 # cost: mean squared distance of each point to its centroid
    for i in range(N):
        centroid = centroids[label[i]]
        dist_to_cent = minkovDist(dataset[i,].getA()[0], centroid.getA()[0])
        cost += dist_to_cent**2

    cost = cost/N
    return {'group':label, 'centroids':centroids, 'cost':cost}
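# Tiny smoke test of KmeansAlgo on random 2-D points (added for illustration, not in the
# original post); with k = 2 it should return two centroids and a non-negative cost.
demo = np.matrix(np.random.rand(50, 2))
demo_result = KmeansAlgo(demo, 2)
print('smoke-test cost:', demo_result['cost'])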

#%%
# Step 1: data preprocessing
loan_table = pd.read_csv('data/loan details.csv', header=0, encoding='gb2312')
cust_table = pd.read_csv('data/customer table.csv', header=0, encoding='gb2312')

cust_id = cust_table['CUST_ID'].drop_duplicates().to_frame(name='CUST_ID')
cust_id.columns = ['CUST_ID']
loan_table_cust = cust_id.merge(loan_table, on='CUST_ID', how='inner') # loan records of the de-duplicated customer IDs

id_freq = loan_table.groupby(['CUST_ID'])['CUST_ID'].count()
id_freq2 = id_freq.to_dict()
id_freq3 = [k for k ,v in id_freq2.items() if v>1]
id_freq4 = pd.DataFrame({'CUST_ID':id_freq3}) # customers with more than one loan record

'''
Some of the duplicate rows come from repeated submissions of the same application (e.g. an earlier
submission was incomplete). Rows belonging to the same loan application share the same values of
'CUST_ID', 'Loan_Type', 'Loan_Term', 'Start_Date', 'End_Date', 'Loan_Amt', 'Undisbursed_Amt'
and 'Business_Type_Code'.
'''
dup_records = id_freq4.merge(loan_table, on='CUST_ID', how='inner')[['CUST_ID','Loan_Type','Loan_Term','Start_Date','End_Date',
                                                                    'Loan_Amt','Undisbursed_Amt','Business_Type_Code']]

dup_records2 = dup_records.drop_duplicates() # drop rows that are exact duplicates
id_dup = dup_records2.groupby(['CUST_ID'])['CUST_ID'].count().to_dict() # number of distinct loan records per customer
# customers left with only one record after de-duplication
id_dup1 = [k for k,v in id_dup.items() if v == 1]
id_dup1_df = pd.DataFrame({'CUST_ID':id_dup1})
drop_dup_1 = pd.merge(id_dup1_df, loan_table, on='CUST_ID', how='left')
drop_dup_1b = drop_dup_1.groupby('CUST_ID').last() # keep the most recent loan record for these customers

id_all = list(id_freq.index) # list of all distinct CUST_IDs
id_non_dup = [i for i in id_all if i not in set(drop_dup_1b.index)] # customers not affected by repeated submissions
id_non_dup_df = pd.DataFrame({'CUST_ID':id_non_dup})
id_non_dup_df2 = pd.merge(id_non_dup_df, loan_table, on='CUST_ID', how='left') # their loan records

id_loans = pd.concat([id_non_dup_df2, drop_dup_1b]) # the de-duplicated customer loan records
#%%
# Impute missing values: a missing Interest_Payment becomes its own category, 9
temp = id_loans.apply(lambda x: int(makeupMissing(x['Interest_Payment'],9)), axis=1) # missing -> separate category 9
id_loans['Interest_Payment'] = temp
temp = id_loans.apply(lambda x: int(makeupMissing(x['Credit_Level'],0)), axis=1) # missing Credit_Level -> category 0
id_loans['Credit_Level'] = temp

# Attach the loan records to each customer; if a customer has several loans, collect each field into a list
all_vars = list(id_loans.columns)
all_vars.remove('CUST_ID')
for var in all_vars:
    id_loans[var] = id_loans[var].apply(lambda x: [x])
id_loans_group = id_loans.groupby('CUST_ID').sum() # summing lists concatenates them; this step is somewhat slow
#%%
# Derive new features from the loan records; normalize them where needed
# number of distinct Loan_Types per customer
var1 = id_loans_group.apply(lambda x: len(set(x['Loan_Type'])), axis=1)
var1 = var1.to_frame(name='Num_Loan_Types')

# number of loans per customer
var2 = id_loans_group.apply(lambda x: len(x['Loan_Type']), axis=1)
var2 = var2.to_frame(name='Num_Loans')

# loan term
var3a = id_loans_group.apply(lambda x: max(x['Loan_Term']), axis=1)
var3a = var3a.to_frame(name='Max_Loan_Term')
normalization(var3a, 'Max_Loan_Term')
var3b = id_loans_group.apply(lambda x: min(x['Loan_Term']), axis=1)
var3b = var3b.to_frame(name='Min_Loan_Term')
normalization(var3b, 'Min_Loan_Term')

# loan amount
var4a = id_loans_group.apply(lambda x: sum(x['Loan_Amt']), axis=1)
var4a = var4a.to_frame(name='Total_Loan_Amt')
var4b = id_loans_group.apply(lambda x: np.mean(x['Loan_Amt']), axis=1)
var4b = var4b.to_frame(name='Mean_Loan_Amt')

# undisbursed loan amount
var5a = id_loans_group.apply(lambda x: sum(x['Undisbursed_Amt']), axis=1)
var5a = var5a.to_frame(name='Total_Undisbursed_Amt')
var5b = id_loans_group.apply(lambda x: np.mean(x['Undisbursed_Amt']), axis=1)
var5b = var5b.to_frame(name='Mean_Undisbursed_Amt')

# ratio of undisbursed amount to total loan amount
var6a = pd.concat([var4a,var5a],axis=1)
var6a['Total_Undisbursed_to_Loan'] = var6a.apply(lambda x: (x['Total_Undisbursed_Amt']/x['Total_Loan_Amt']), axis=1)
del var6a['Total_Undisbursed_Amt']
del var6a['Total_Loan_Amt']

var6b = pd.concat([var4b,var5b],axis=1)
var6b['Mean_Undisbursed_to_Loan'] = var6b.apply(lambda x: (x['Mean_Undisbursed_Amt']/x['Mean_Loan_Amt']), axis=1)
del var6b['Mean_Undisbursed_Amt']
del var6b['Mean_Loan_Amt']

normalization(var4a, 'Total_Loan_Amt')
normalization(var4b, 'Mean_Loan_Amt')
normalization(var5a, 'Total_Undisbursed_Amt')
normalization(var5b, 'Mean_Undisbursed_Amt')

# minimum and maximum interest rate per customer
var7a = id_loans_group.apply(lambda x: min(x['Interest_Rate']), axis=1)
var7a = var7a.to_frame(name='Min_Interest_Rate')
normalization(var7a,'Min_Interest_Rate')

var7b = id_loans_group.apply(lambda x: max(x['Interest_Rate']), axis=1)
var7b = var7b.to_frame(name='Max_Interest_Rate')
normalization(var7b, 'Max_Interest_Rate')

derived_features = pd.concat([var1,var2,var3a,var3b,var4a,var4b,var5a,var5b,var6a,var6b,var7a,var7b], axis=1)

var_onehot_list = ['Business_Type_Code', 'Repay_Way', 'Interest_Payment', 'Rural',
                   'External_Ind', 'Credit_Level', 'Gender']
for var_onehot in var_onehot_list:
    var_onehot_df = id_loans_group[var_onehot]
    var_onehot_df = var_onehot_df.to_frame(name = var_onehot)
    oneHotEncoding(var_onehot_df, var_onehot)
    derived_features = pd.concat([derived_features, var_onehot_df], axis=1)

#%%
# The dataset is large, so take the first 1000 customers for a clustering test
M = 1000
dataset = np.matrix(derived_features)[:M,]
cost = []
# try k = 2..6 and look at the clustering cost
for k in range(2,7):
    result = KmeansAlgo(dataset, k)
    cost.append(result['cost'])

plt.plot(range(2,7), cost)
plt.xlabel('num of clusters')
plt.ylabel('cost of clustering')
plt.title('Elbow method')
plt.show()

#%%
# The elbow plot shows the rate of cost reduction changes at k = 3, so choose k = 3
result = KmeansAlgo(dataset, 3)
featureCompared = np.matrix(np.zeros((dataset.shape[1])))
for l in range(3):
    groupIndex = [i for i in range(M) if result['group'][i] == l]
    temp = dataset[groupIndex]
    featureMean = np.mean(temp, axis=0)
    featureCompared = np.row_stack((featureCompared, featureMean))
featureCompared = np.delete(featureCompared, 0, 0) # np.delete returns a new array, so assign it back to drop the all-zero first row
# plot two of the features to inspect the clusters
cols = ['g', 'b', 'r']
for l in range(3):
    groupIndex = [i for i in range(M) if result['group'][i] == l]
    x1 = dataset[groupIndex,2].getA()
    x2 = dataset[groupIndex,10].getA()
    p=plt.scatter(x1, x2, color = cols[l])
plt.show()
#%%
# disMat = sch.distance.pdist(dataset, 'euclidean')
# Z = sch.linkage(disMat, method='average')
# P = sch.dendrogram(Z)
# cluster = sch.fcluster(Z, t=1, criterion='inconsistent')

# Try sklearn's KMeans (it uses k-means++ initialization by default)
kmeans = cluster.KMeans(n_clusters=3, random_state=0)
kmeans.fit(dataset)

cols = ['g', 'b', 'r']
for l in range(3):
    groupIndex = [i for i in range(M) if kmeans.labels_[i] == l]
    x1 = dataset[groupIndex,2].getA()
    x2 = dataset[groupIndex,10].getA()
    p=plt.scatter(x1, x2, color = cols[l])
plt.show()
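# Added for comparison (not in the original post): sklearn's inertia_ is the total
# within-cluster sum of squared distances, so dividing it by M puts it on the same
# scale as the 'cost' returned by KmeansAlgo (mean squared distance to the centroid).
print('custom k-means cost  :', result['cost'])
print('sklearn k-means cost :', kmeans.inertia_ / M)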


