import os
import pandas as pd
os.chdir(r'H:\2019-2-3新华书店笔记以及资料\资料\HW5')
card=pd.read_csv(r"card.csv",encoding="gbk") # read the data
disp=pd.read_csv(r"disp.csv",encoding="gbk")
clients=pd.read_csv(r"clients.csv",encoding="gbk")
card.head() # preview the data
|   | card_id | disp_id | issued     | type   |
|---|---------|---------|------------|--------|
| 0 | 1005    | 9285    | 1993-11-07 | 普通卡 |
| 1 | 104     | 588     | 1994-01-19 | 普通卡 |
| 2 | 747     | 4915    | 1994-02-05 | 普通卡 |
| 3 | 70      | 439     | 1994-02-08 | 普通卡 |
| 4 | 577     | 3687    | 1994-02-15 | 普通卡 |
disp.head()
|   | disp_id | client_id | account_id | type   |
|---|---------|-----------|------------|--------|
| 0 | 1       | 1         | 1          | 所有者 |
| 1 | 2       | 2         | 2          | 所有者 |
| 2 | 3       | 3         | 2          | 用户   |
| 3 | 4       | 4         | 3          | 所有者 |
| 4 | 5       | 5         | 3          | 用户   |
clients.head()
|   | client_id | sex | birth_date | district_id |
|---|-----------|-----|------------|-------------|
| 0 | 1         | 女  | 1970-12-13 | 18          |
| 1 | 2         | 男  | 1945-02-04 | 1           |
| 2 | 3         | 女  | 1940-10-09 | 1           |
| 3 | 4         | 男  | 1956-12-01 | 5           |
| 4 | 5         | 女  | 1960-07-03 | 5           |
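The issued and birth_date columns arrive as plain strings and are converted with pd.to_datetime further down; as a side note, read_csv can also parse them at load time (this is only an optional variant of the loading step above):

# optional: parse the date columns while reading instead of converting later
card = pd.read_csv(r"card.csv", encoding="gbk", parse_dates=["issued"])
clients = pd.read_csv(r"clients.csv", encoding="gbk", parse_dates=["birth_date"])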
import sqlite3 # sqlite3 is the lightweight option; for more features use SQLAlchemy
con = sqlite3.connect(':memory:') # in-memory database connection, used to join and query the tables with SQL
card.to_sql('card', con)
disp.to_sql('disp', con)
clients.to_sql('clients', con)
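As the comment above notes, sqlite3 covers the basics; a minimal sketch of the SQLAlchemy route, assuming SQLAlchemy is installed (the in-memory URL and the round-trip query are only illustrative):

from sqlalchemy import create_engine
engine = create_engine('sqlite:///:memory:')           # a separate in-memory database
card.to_sql('card', engine, index=False)               # pandas accepts an engine as well as a raw connection
pd.read_sql('select count(*) as n from card', engine)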
car_sql='''
select a.*, c.sex, c.birth_date, c.district_id
from card a
left join disp b on a.disp_id=b.disp_id
left join clients c on b.client_id=c.client_id
where b.type="所有者"
'''
card_t=pd.read_sql(car_sql, con)
card_t.head()
|   | index | card_id | disp_id | issued     | type   | sex | birth_date | district_id |
|---|-------|---------|---------|------------|--------|-----|------------|-------------|
| 0 | 0     | 1005    | 9285    | 1993-11-07 | 普通卡 | 女  | 1968-01-28 | 74          |
| 1 | 1     | 104     | 588     | 1994-01-19 | 普通卡 | 女  | 1960-10-20 | 61          |
| 2 | 2     | 747     | 4915    | 1994-02-05 | 普通卡 | 男  | 1963-07-19 | 40          |
| 3 | 3     | 70      | 439     | 1994-02-08 | 普通卡 | 女  | 1968-09-12 | 51          |
| 4 | 4     | 577     | 3687    | 1994-02-15 | 普通卡 | 男  | 1972-02-06 | 49          |
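The same owner-only join can also be expressed with pandas merges instead of SQL; a sketch under the assumption that the three DataFrames above are unchanged (card_t_alt is an illustrative name):

owners = disp.loc[disp['type']=='所有者', ['disp_id','client_id']]   # keep only the account owners
card_t_alt = (card.merge(owners, on='disp_id', how='left')
                  .merge(clients[['client_id','sex','birth_date','district_id']],
                         on='client_id', how='left'))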
import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['SimHei'] # set the default font (SimHei, so Chinese labels render)
mpl.rcParams['axes.unicode_minus'] = False # keep the minus sign from rendering as a box in saved figures
card_t['type'].value_counts().plot(kind='pie',autopct='%.1f%%') # pie chart of card-type distribution
# 1. Card issuance trend
# Overall issuance trend
card_t['issued_date']=pd.to_datetime(card_t['issued'])
card_t['issued_year']=card_t['issued_date'].map(lambda x:x.year)
card_t.card_id.groupby(card_t['issued_year']).count().plot(kind="bar") # cards issued per year
# 2. Distribution of card types
card_t.type.value_counts().plot(kind="pie",autopct='%.1f%%') # card-type share
pd.crosstab(card_t.issued_year,card_t.type).plot(kind = 'bar') # card type by issue year
t1=pd.crosstab(card_t.issued_year,card_t.type) # share of each card type by issue year
t1["sum1"]=t1.sum(1)
t2=t1.div(t1.sum1,axis = 0)
t2.drop("sum1",1).plot(kind = 'bar',stacked= True)
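The same percentage view can also be obtained directly from crosstab's normalize option (a side note, not part of the original steps):

pd.crosstab(card_t.issued_year, card_t.type, normalize='index').plot(kind='bar', stacked=True)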
import matplotlib.pyplot as plt
labels=["青年卡","普通卡","金卡"]
y1=t1.loc[:,"青年卡"]
y2=t1.loc[:,"普通卡"]
y3=t1.loc[:,"金卡"]
x=t1.index
plt.stackplot(x,y1,y2,y3,labels = labels) # stacked area chart of issuance by card type
plt.title('发卡趋势')
plt.ylabel('发卡量')
plt.legend(loc = 'upper left') # legend position
plt.show()
pd.crosstab(card_t.type,card_t.sex).plot(kind='bar') # counts of card type by gender
xb=pd.crosstab(card_t.type,card_t.sex) # gender share within each card type
xb.div(xb.sum(1),axis=0).plot(kind='bar')
from stack2dim import * # a previously written helper module
stack2dim(card_t,'type','sex') # card type vs. gender: bar width encodes counts, y-axis shows percentages
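stack2dim is a custom helper; if it is not at hand, a roughly comparable count-plus-proportion view can be sketched with statsmodels' mosaic plot (an approximation, not the original chart):

from statsmodels.graphics.mosaicplot import mosaic   # tile area encodes counts
import matplotlib.pyplot as plt
mosaic(card_t, ['type', 'sex'])
plt.show()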
card_t['age']=(pd.to_datetime(card_t['issued'])-pd.to_datetime(card_t['birth_date']))
card_t['age1']=card_t['age'].map(lambda x:x.days/365) # age in years at card issuance
import seaborn as sns
sns.boxplot(x='type',y='age1',data=card_t) # box plot of holder age by card type
trans=pd.read_csv(r"trans.csv",encoding="gbk")
trans.to_sql('trans', con)
card_t.to_sql('card_t', con)
# Two warnings appear here: a DtypeWarning because column 8 of trans.csv has mixed types,
# and a UserWarning because the timedelta column 'age' is written to SQLite as integer nanoseconds.
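The DtypeWarning can be avoided by telling read_csv how to treat the offending column; a sketch, assuming column 8 is the bank column shown in trans.head() below and that reading it as text is acceptable:

# re-read with an explicit dtype for the mixed-type column (or pass low_memory=False)
trans = pd.read_csv(r"trans.csv", encoding="gbk", dtype={'bank': str})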
trans.head()
|   | trans_id | account_id | date       | type | operation | amount | balance | k_symbol | bank | account |
|---|----------|------------|------------|------|-----------|--------|---------|----------|------|---------|
| 0 | 695247   | 2378       | 1993-01-01 | 贷   | 信贷资金  | $700   | $700    | NaN      | NaN  | NaN     |
| 1 | 171812   | 576        | 1993-01-01 | 贷   | 信贷资金  | $900   | $900    | NaN      | NaN  | NaN     |
| 2 | 207264   | 704        | 1993-01-01 | 贷   | 信贷资金  | $1,000 | $1,000  | NaN      | NaN  | NaN     |
| 3 | 1117247  | 3818       | 1993-01-01 | 贷   | 信贷资金  | $600   | $600    | NaN      | NaN  | NaN     |
| 4 | 579373   | 1972       | 1993-01-02 | 贷   | 信贷资金  | $400   | $400    | NaN      | NaN  | NaN     |
card_t.head()
|   | index | card_id | disp_id | issued     | type   | sex | birth_date | district_id | issued_date | issued_year | age        | age1      |
|---|-------|---------|---------|------------|--------|-----|------------|-------------|-------------|-------------|------------|-----------|
| 0 | 0     | 1005    | 9285    | 1993-11-07 | 普通卡 | 女  | 1968-01-28 | 74          | 1993-11-07  | 1993        | 9415 days  | 25.794521 |
| 1 | 1     | 104     | 588     | 1994-01-19 | 普通卡 | 女  | 1960-10-20 | 61          | 1994-01-19  | 1994        | 12144 days | 33.271233 |
| 2 | 2     | 747     | 4915    | 1994-02-05 | 普通卡 | 男  | 1963-07-19 | 40          | 1994-02-05  | 1994        | 11159 days | 30.572603 |
| 3 | 3     | 70      | 439     | 1994-02-08 | 普通卡 | 女  | 1968-09-12 | 51          | 1994-02-08  | 1994        | 9280 days  | 25.424658 |
| 4 | 4     | 577     | 3687    | 1994-02-15 | 普通卡 | 男  | 1972-02-06 | 49          | 1994-02-15  | 1994        | 8045 days  | 22.041096 |
car_sql='''
select a.card_id,a.issued,a.type,c.type as t_type,c.amount,c.balance,c.date as t_date
from card as a
left join disp as b on a.disp_id=b.disp_id
left join trans as c on b.account_id=c.account_id
where b.type="所有者"
order by a.card_id,c.date
''' # join the card, disp and trans tables with SQL
card_t2=pd.read_sql(car_sql, con)
card_t2.head()
|   | card_id | issued     | type | t_type | amount | balance | t_date     |
|---|---------|------------|------|--------|--------|---------|------------|
| 0 | 1       | 1998-10-16 | 金卡 | 贷     | $900   | $900    | 1996-11-24 |
| 1 | 1       | 1998-10-16 | 金卡 | 贷     | $33975 | $34875  | 1996-12-07 |
| 2 | 1       | 1998-10-16 | 金卡 | 借     | $6,600 | $28275  | 1996-12-24 |
| 3 | 1       | 1998-10-16 | 金卡 | 贷     | $30    | $28305  | 1996-12-31 |
| 4 | 1       | 1998-10-16 | 金卡 | 贷     | $22650 | $50955  | 1997-01-07 |
card_t2['issued']=pd.to_datetime(card_t2['issued'])
card_t2['t_date']=pd.to_datetime(card_t2['t_date'])
card_t2.head()
|   | card_id | issued     | type | t_type | amount | balance | t_date     |
|---|---------|------------|------|--------|--------|---------|------------|
| 0 | 1       | 1998-10-16 | 金卡 | 贷     | $900   | $900    | 1996-11-24 |
| 1 | 1       | 1998-10-16 | 金卡 | 贷     | $33975 | $34875  | 1996-12-07 |
| 2 | 1       | 1998-10-16 | 金卡 | 借     | $6,600 | $28275  | 1996-12-24 |
| 3 | 1       | 1998-10-16 | 金卡 | 贷     | $30    | $28305  | 1996-12-31 |
| 4 | 1       | 1998-10-16 | 金卡 | 贷     | $22650 | $50955  | 1997-01-07 |
# strip the '$' sign and the thousands separators to get numeric values
card_t2['balance2'] = card_t2['balance'].map(lambda x: int(''.join(x[1:].split(','))))
card_t2['amount2'] = card_t2['amount'].map(lambda x: int(''.join(x[1:].split(','))))
import datetime
# average balance in the year before the card was issued:
# keep transactions dated within 365 days before the issue date; combining the two conditions
# with & avoids the reindexing warning the original chained selection produced, and .copy()
# prevents SettingWithCopyWarning when new columns are added to card_t3 later
card_t3 = card_t2[(card_t2.issued > card_t2.t_date) &
                  (card_t2.issued < card_t2.t_date + datetime.timedelta(days=365))].copy()
card_t3.head()
|    | card_id | issued     | type | t_type | amount | balance | t_date     | balance2 | amount2 |
|----|---------|------------|------|--------|--------|---------|------------|----------|---------|
| 47 | 1       | 1998-10-16 | 金卡 | 借     | $13600 | $57950  | 1997-10-20 | 57950    | 13600   |
| 48 | 1       | 1998-10-16 | 金卡 | 借     | $15    | $58197  | 1997-10-31 | 58197    | 15      |
| 49 | 1       | 1998-10-16 | 金卡 | 贷     | $262   | $58212  | 1997-10-31 | 58212    | 262     |
| 50 | 1       | 1998-10-16 | 金卡 | 借     | $4,880 | $53317  | 1997-11-06 | 53317    | 4880    |
| 51 | 1       | 1998-10-16 | 金卡 | 贷     | $22650 | $75967  | 1997-11-07 | 75967    | 22650   |
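An equivalent, vectorized way to clean the currency strings (a side sketch; it assumes '$' and ',' are the only non-numeric characters, as in the rows shown above):

card_t2['balance2'] = pd.to_numeric(card_t2['balance'].str.replace(r'[$,]', '', regex=True))
card_t2['amount2'] = pd.to_numeric(card_t2['amount'].str.replace(r'[$,]', '', regex=True))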
#card_t3["avg_balance"] = card_t3.groupby('card_id')['balance2'].mean() #生成的series 下面agg生成的dataframe card_t4=card_t3.groupby(['type','card_id'])['balance2'].agg([('avg_balance','mean')]) card_t4.head() #card_t4.to_sql('card_t4', con)
| type   | card_id | avg_balance  |
|--------|---------|--------------|
| 普通卡 | 2       | 38898.610169 |
|        | 4       | 54222.060976 |
|        | 7       | 46888.702381 |
|        | 9       | 55854.223881 |
|        | 10      | 51387.500000 |
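The list-of-tuples renaming above works, but pandas 0.25+ also offers named aggregation; a sketch of the equivalent call (card_t4_alt is an illustrative name):

card_t4_alt = card_t3.groupby(['type','card_id']).agg(avg_balance=('balance2','mean'))
card_t4_alt.head()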
card_t3.groupby(['type','card_id'])['balance2'].agg([('avg_balance','mean'),'count']).head() # also count the transactions behind each average
| type   | card_id | avg_balance  | count |
|--------|---------|--------------|-------|
| 普通卡 | 2       | 38898.610169 | 59    |
|        | 4       | 54222.060976 | 82    |
|        | 7       | 46888.702381 | 84    |
|        | 9       | 55854.223881 | 67    |
|        | 10      | 51387.500000 | 66    |
card_t5=card_t4.reset_index() # turn the group keys back into columns; the card-type values are unchanged
# card_t5=pd.read_sql('select * from card_t4', con)
sns.boxplot(x = 'type', y = 'avg_balance', data = card_t5) # average pre-issue balance by card type
# %% 6. Average income in the year before issuance, compared across card types
type_dict = {'借':'out','贷':'income'} # debit -> out, credit -> income
card_t3['type1'] = card_t3.t_type.map(type_dict)
card_t6= card_t3.groupby(['type','card_id','type1'])[['amount2']].sum()
card_t6.to_sql('card_t6', con)
card_t7=card_t6.reset_index()
# card_t7=pd.read_sql('select * from card_t6', con)
card_t7.to_sql('card_t7', con)
card_t8=pd.read_sql('select * from card_t7 where type1="income"', con)
card_t6.head()
| type   | card_id | type1  | amount2 |
|--------|---------|--------|---------|
| 普通卡 | 2       | income | 193911  |
|        |         | out    | 196384  |
|        | 4       | income | 474142  |
|        |         | out    | 357224  |
|        | 7       | income | 299797  |
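The SQL round-trip through card_t6/card_t7 can also be done purely in pandas; a sketch (card_t8_alt and income_out are illustrative names):

card_t8_alt = card_t7[card_t7['type1']=='income']                 # same rows as card_t8
income_out = card_t6['amount2'].unstack('type1').reset_index()    # income and out side by side per card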
sns.boxplot(x = 'type', y = 'amount2', data = card_t8) # pre-issue income by card type
card_t8.head()
|   | index | type   | card_id | type1  | amount2 |
|---|-------|--------|---------|--------|---------|
| 0 | 0     | 普通卡 | 2       | income | 193911  |
| 1 | 2     | 普通卡 | 4       | income | 474142  |
| 2 | 4     | 普通卡 | 7       | income | 299797  |
| 3 | 6     | 普通卡 | 9       | income | 322548  |
| 4 | 8     | 普通卡 | 10      | income | 644560  |
card_t9=pd.read_sql('select * from card_t7 where type1="out"', con) # spending vs. card type
sns.boxplot(x = 'type', y = 'amount2', data = card_t9)