import os
import pandas as pd
os.chdir(r'H:\2019-2-3新华书店笔记以及资料\资料\HW5')
card=pd.read_csv(r"card.csv",encoding="gbk") # read the data
disp=pd.read_csv(r"disp.csv",encoding="gbk")
clients=pd.read_csv(r"clients.csv",encoding="gbk")
card.head() # preview the data
|   | card_id | disp_id | issued     | type   |
|---|---------|---------|------------|--------|
| 0 | 1005    | 9285    | 1993-11-07 | 普通卡 |
| 1 | 104     | 588     | 1994-01-19 | 普通卡 |
| 2 | 747     | 4915    | 1994-02-05 | 普通卡 |
| 3 | 70      | 439     | 1994-02-08 | 普通卡 |
| 4 | 577     | 3687    | 1994-02-15 | 普通卡 |
disp.head()
|   | disp_id | client_id | account_id | type   |
|---|---------|-----------|------------|--------|
| 0 | 1       | 1         | 1          | 所有者 |
| 1 | 2       | 2         | 2          | 所有者 |
| 2 | 3       | 3         | 2          | 用户   |
| 3 | 4       | 4         | 3          | 所有者 |
| 4 | 5       | 5         | 3          | 用户   |
clients.head()
|   | client_id | sex | birth_date | district_id |
|---|-----------|-----|------------|-------------|
| 0 | 1         | 女  | 1970-12-13 | 18          |
| 1 | 2         | 男  | 1945-02-04 | 1           |
| 2 | 3         | 女  | 1940-10-09 | 1           |
| 3 | 4         | 男  | 1956-12-01 | 5           |
| 4 | 5         | 女  | 1960-07-03 | 5           |
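The issued and birth_date columns arrive as plain strings and are converted with pd.to_datetime further down; as a side note, read_csv can also parse them at load time (this is only an optional variant of the loading step above):

# optional: parse the date columns while reading instead of converting later
card = pd.read_csv(r"card.csv", encoding="gbk", parse_dates=["issued"])
clients = pd.read_csv(r"clients.csv", encoding="gbk", parse_dates=["birth_date"])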
import sqlite3 # sqlite3 is the lightweight option; for more features use SQLAlchemy
con = sqlite3.connect(':memory:') # in-memory database connection, used to join and query the tables with SQL
card.to_sql('card', con)
disp.to_sql('disp', con)
clients.to_sql('clients', con)
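As the comment above notes, sqlite3 covers the basics; a minimal sketch of the SQLAlchemy route, assuming SQLAlchemy is installed (the in-memory URL and the round-trip query are only illustrative):

from sqlalchemy import create_engine
engine = create_engine('sqlite:///:memory:')           # a separate in-memory database
card.to_sql('card', engine, index=False)               # pandas accepts an engine as well as a raw connection
pd.read_sql('select count(*) as n from card', engine)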
car_sql='''
select a.*, c.sex, c.birth_date, c.district_id
from card a
left join disp b on a.disp_id=b.disp_id
left join clients c on b.client_id=c.client_id
where b.type="所有者"
'''
card_t=pd.read_sql(car_sql, con)
card_t.head()
|   | index | card_id | disp_id | issued     | type   | sex | birth_date | district_id |
|---|-------|---------|---------|------------|--------|-----|------------|-------------|
| 0 | 0     | 1005    | 9285    | 1993-11-07 | 普通卡 | 女  | 1968-01-28 | 74          |
| 1 | 1     | 104     | 588     | 1994-01-19 | 普通卡 | 女  | 1960-10-20 | 61          |
| 2 | 2     | 747     | 4915    | 1994-02-05 | 普通卡 | 男  | 1963-07-19 | 40          |
| 3 | 3     | 70      | 439     | 1994-02-08 | 普通卡 | 女  | 1968-09-12 | 51          |
| 4 | 4     | 577     | 3687    | 1994-02-15 | 普通卡 | 男  | 1972-02-06 | 49          |
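The same owner-only join can also be expressed with pandas merges instead of SQL; a sketch under the assumption that the three DataFrames above are unchanged (card_t_alt is an illustrative name):

owners = disp.loc[disp['type']=='所有者', ['disp_id','client_id']]   # keep only the account owners
card_t_alt = (card.merge(owners, on='disp_id', how='left')
                  .merge(clients[['client_id','sex','birth_date','district_id']],
                         on='client_id', how='left'))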
import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['SimHei'] # set the default font (SimHei, so Chinese labels render)
mpl.rcParams['axes.unicode_minus'] = False # keep the minus sign from rendering as a box in saved figures
card_t['type'].value_counts().plot(kind='pie',autopct='%.1f%%') # pie chart of card-type distribution
# 1. Card issuance trend
# Overall issuance trend
card_t['issued_date']=pd.to_datetime(card_t['issued'])
card_t['issued_year']=card_t['issued_date'].map(lambda x:x.year)
card_t.card_id.groupby(card_t['issued_year']).count().plot(kind="bar") # cards issued per year
# 2. Distribution of card types
card_t.type.value_counts().plot(kind="pie",autopct='%.1f%%') # card-type share
pd.crosstab(card_t.issued_year,card_t.type).plot(kind = 'bar') # card type by issue year
t1=pd.crosstab(card_t.issued_year,card_t.type) # share of each card type by issue year
t1["sum1"]=t1.sum(1)
t2=t1.div(t1.sum1,axis = 0)
t2.drop("sum1",1).plot(kind = 'bar',stacked= True)
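The same percentage view can also be obtained directly from crosstab's normalize option (a side note, not part of the original steps):

pd.crosstab(card_t.issued_year, card_t.type, normalize='index').plot(kind='bar', stacked=True)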
import matplotlib.pyplot as plt
labels=["青年卡","普通卡","金卡"]
y1=t1.loc[:,"青年卡"]
y2=t1.loc[:,"普通卡"]
y3=t1.loc[:,"金卡"]
x=t1.index
plt.stackplot(x,y1,y2,y3,labels = labels) # stacked area chart of issuance by card type
plt.title('发卡趋势')
plt.ylabel('发卡量')
plt.legend(loc = 'upper left') # legend position
plt.show()
pd.crosstab(card_t.type,card_t.sex).plot(kind='bar') # counts of card type by gender
xb=pd.crosstab(card_t.type,card_t.sex) # gender share within each card type
xb.div(xb.sum(1),axis=0).plot(kind='bar')
from stack2dim import * # a previously written helper module
stack2dim(card_t,'type','sex') # card type vs. gender: bar width encodes counts, y-axis shows percentages
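stack2dim is a custom helper; if it is not at hand, a roughly comparable count-plus-proportion view can be sketched with statsmodels' mosaic plot (an approximation, not the original chart):

from statsmodels.graphics.mosaicplot import mosaic   # tile area encodes counts
import matplotlib.pyplot as plt
mosaic(card_t, ['type', 'sex'])
plt.show()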
card_t['age']=(pd.to_datetime(card_t['issued'])-pd.to_datetime(card_t['birth_date']))
card_t['age1']=card_t['age'].map(lambda x:x.days/365) # age in years at card issuance
import seaborn as sns
sns.boxplot(x='type',y='age1',data=card_t) # box plot of holder age by card type
trans=pd.read_csv(r"trans.csv",encoding="gbk")
trans.to_sql('trans', con)
card_t.to_sql('card_t', con)
# Two warnings appear here: a DtypeWarning because column 8 of trans.csv has mixed types,
# and a UserWarning because the timedelta column 'age' is written to SQLite as integer nanoseconds.
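The DtypeWarning can be avoided by telling read_csv how to treat the offending column; a sketch, assuming column 8 is the bank column shown in trans.head() below and that reading it as text is acceptable:

# re-read with an explicit dtype for the mixed-type column (or pass low_memory=False)
trans = pd.read_csv(r"trans.csv", encoding="gbk", dtype={'bank': str})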
trans.head()
|   | trans_id | account_id | date       | type | operation | amount | balance | k_symbol | bank | account |
|---|----------|------------|------------|------|-----------|--------|---------|----------|------|---------|
| 0 | 695247   | 2378       | 1993-01-01 | 贷   | 信贷资金  | $700   | $700    | NaN      | NaN  | NaN     |
| 1 | 171812   | 576        | 1993-01-01 | 贷   | 信贷资金  | $900   | $900    | NaN      | NaN  | NaN     |
| 2 | 207264   | 704        | 1993-01-01 | 贷   | 信贷资金  | $1,000 | $1,000  | NaN      | NaN  | NaN     |
| 3 | 1117247  | 3818       | 1993-01-01 | 贷   | 信贷资金  | $600   | $600    | NaN      | NaN  | NaN     |
| 4 | 579373   | 1972       | 1993-01-02 | 贷   | 信贷资金  | $400   | $400    | NaN      | NaN  | NaN     |
card_t.head()
|   | index | card_id | disp_id | issued     | type   | sex | birth_date | district_id | issued_date | issued_year | age        | age1      |
|---|-------|---------|---------|------------|--------|-----|------------|-------------|-------------|-------------|------------|-----------|
| 0 | 0     | 1005    | 9285    | 1993-11-07 | 普通卡 | 女  | 1968-01-28 | 74          | 1993-11-07  | 1993        | 9415 days  | 25.794521 |
| 1 | 1     | 104     | 588     | 1994-01-19 | 普通卡 | 女  | 1960-10-20 | 61          | 1994-01-19  | 1994        | 12144 days | 33.271233 |
| 2 | 2     | 747     | 4915    | 1994-02-05 | 普通卡 | 男  | 1963-07-19 | 40          | 1994-02-05  | 1994        | 11159 days | 30.572603 |
| 3 | 3     | 70      | 439     | 1994-02-08 | 普通卡 | 女  | 1968-09-12 | 51          | 1994-02-08  | 1994        | 9280 days  | 25.424658 |
| 4 | 4     | 577     | 3687    | 1994-02-15 | 普通卡 | 男  | 1972-02-06 | 49          | 1994-02-15  | 1994        | 8045 days  | 22.041096 |
car_sql='''
select a.card_id,a.issued,a.type,c.type as t_type,c.amount,c.balance,c.date as t_date
from card as a
left join disp as b on a.disp_id=b.disp_id
left join trans as c on b.account_id=c.account_id
where b.type="所有者"
order by a.card_id,c.date
''' # join the card, disp and trans tables with SQL
card_t2=pd.read_sql(car_sql, con)
card_t2.head()
|   | card_id | issued     | type | t_type | amount | balance | t_date     |
|---|---------|------------|------|--------|--------|---------|------------|
| 0 | 1       | 1998-10-16 | 金卡 | 贷     | $900   | $900    | 1996-11-24 |
| 1 | 1       | 1998-10-16 | 金卡 | 贷     | $33975 | $34875  | 1996-12-07 |
| 2 | 1       | 1998-10-16 | 金卡 | 借     | $6,600 | $28275  | 1996-12-24 |
| 3 | 1       | 1998-10-16 | 金卡 | 贷     | $30    | $28305  | 1996-12-31 |
| 4 | 1       | 1998-10-16 | 金卡 | 贷     | $22650 | $50955  | 1997-01-07 |
card_t2['issued']=pd.to_datetime(card_t2['issued'])
card_t2['t_date']=pd.to_datetime(card_t2['t_date'])
card_t2.head()
|   | card_id | issued     | type | t_type | amount | balance | t_date     |
|---|---------|------------|------|--------|--------|---------|------------|
| 0 | 1       | 1998-10-16 | 金卡 | 贷     | $900   | $900    | 1996-11-24 |
| 1 | 1       | 1998-10-16 | 金卡 | 贷     | $33975 | $34875  | 1996-12-07 |
| 2 | 1       | 1998-10-16 | 金卡 | 借     | $6,600 | $28275  | 1996-12-24 |
| 3 | 1       | 1998-10-16 | 金卡 | 贷     | $30    | $28305  | 1996-12-31 |
| 4 | 1       | 1998-10-16 | 金卡 | 贷     | $22650 | $50955  | 1997-01-07 |
# strip the '$' sign and the thousands separators to get numeric values
card_t2['balance2'] = card_t2['balance'].map(lambda x: int(''.join(x[1:].split(','))))
card_t2['amount2'] = card_t2['amount'].map(lambda x: int(''.join(x[1:].split(','))))
import datetime
# average balance in the year before the card was issued:
# keep transactions dated within 365 days before the issue date; combining the two conditions
# with & avoids the reindexing warning the original chained selection produced, and .copy()
# prevents SettingWithCopyWarning when new columns are added to card_t3 later
card_t3 = card_t2[(card_t2.issued > card_t2.t_date) &
                  (card_t2.issued < card_t2.t_date + datetime.timedelta(days=365))].copy()
card_t3.head()
|    | card_id | issued     | type | t_type | amount | balance | t_date     | balance2 | amount2 |
|----|---------|------------|------|--------|--------|---------|------------|----------|---------|
| 47 | 1       | 1998-10-16 | 金卡 | 借     | $13600 | $57950  | 1997-10-20 | 57950    | 13600   |
| 48 | 1       | 1998-10-16 | 金卡 | 借     | $15    | $58197  | 1997-10-31 | 58197    | 15      |
| 49 | 1       | 1998-10-16 | 金卡 | 贷     | $262   | $58212  | 1997-10-31 | 58212    | 262     |
| 50 | 1       | 1998-10-16 | 金卡 | 借     | $4,880 | $53317  | 1997-11-06 | 53317    | 4880    |
| 51 | 1       | 1998-10-16 | 金卡 | 贷     | $22650 | $75967  | 1997-11-07 | 75967    | 22650   |
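An equivalent, vectorized way to clean the currency strings (a side sketch; it assumes '$' and ',' are the only non-numeric characters, as in the rows shown above):

card_t2['balance2'] = pd.to_numeric(card_t2['balance'].str.replace(r'[$,]', '', regex=True))
card_t2['amount2'] = pd.to_numeric(card_t2['amount'].str.replace(r'[$,]', '', regex=True))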
#card_t3["avg_balance"] = card_t3.groupby('card_id')['balance2'].mean() #生成的series 下面agg生成的dataframe card_t4=card_t3.groupby(['type','card_id'])['balance2'].agg([('avg_balance','mean')]) card_t4.head() #card_t4.to_sql('card_t4', con)
| type   | card_id | avg_balance  |
|--------|---------|--------------|
| 普通卡 | 2       | 38898.610169 |
|        | 4       | 54222.060976 |
|        | 7       | 46888.702381 |
|        | 9       | 55854.223881 |
|        | 10      | 51387.500000 |
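The list-of-tuples renaming above works, but pandas 0.25+ also offers named aggregation; a sketch of the equivalent call (card_t4_alt is an illustrative name):

card_t4_alt = card_t3.groupby(['type','card_id']).agg(avg_balance=('balance2','mean'))
card_t4_alt.head()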
card_t3.groupby(['type','card_id'])['balance2'].agg([('avg_balance','mean'),'count']).head() # also count the transactions behind each average
| type   | card_id | avg_balance  | count |
|--------|---------|--------------|-------|
| 普通卡 | 2       | 38898.610169 | 59    |
|        | 4       | 54222.060976 | 82    |
|        | 7       | 46888.702381 | 84    |
|        | 9       | 55854.223881 | 67    |
|        | 10      | 51387.500000 | 66    |
card_t5=card_t4.reset_index() # turn the group keys back into columns; the card-type values are unchanged
# card_t5=pd.read_sql('select * from card_t4', con)
sns.boxplot(x = 'type', y = 'avg_balance', data = card_t5) # average pre-issue balance by card type
# %% 6. Average income in the year before issuance, compared across card types
type_dict = {'借':'out','贷':'income'} # debit -> out, credit -> income
card_t3['type1'] = card_t3.t_type.map(type_dict)
card_t6= card_t3.groupby(['type','card_id','type1'])[['amount2']].sum()
card_t6.to_sql('card_t6', con)
card_t7=card_t6.reset_index()
# card_t7=pd.read_sql('select * from card_t6', con)
card_t7.to_sql('card_t7', con)
card_t8=pd.read_sql('select * from card_t7 where type1="income"', con)
card_t6.head()
| type   | card_id | type1  | amount2 |
|--------|---------|--------|---------|
| 普通卡 | 2       | income | 193911  |
|        |         | out    | 196384  |
|        | 4       | income | 474142  |
|        |         | out    | 357224  |
|        | 7       | income | 299797  |
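The SQL round-trip through card_t6/card_t7 can also be done purely in pandas; a sketch (card_t8_alt and income_out are illustrative names):

card_t8_alt = card_t7[card_t7['type1']=='income']                 # same rows as card_t8
income_out = card_t6['amount2'].unstack('type1').reset_index()    # income and out side by side per card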
sns.boxplot(x = 'type', y = 'amount2', data = card_t8) # pre-issue income by card type
card_t8.head()
|   | index | type   | card_id | type1  | amount2 |
|---|-------|--------|---------|--------|---------|
| 0 | 0     | 普通卡 | 2       | income | 193911  |
| 1 | 2     | 普通卡 | 4       | income | 474142  |
| 2 | 4     | 普通卡 | 7       | income | 299797  |
| 3 | 6     | 普通卡 | 9       | income | 322548  |
| 4 | 8     | 普通卡 | 10      | income | 644560  |
card_t9=pd.read_sql('select * from card_t7 where type1="out"', con) # spending vs. card type
sns.boxplot(x = 'type', y = 'amount2', data = card_t9)