需要数据分析、风控评分卡等相关数据、代码,请添加qq群:102755159,或留言联系笔者邮件发送!!!
如果对金融风控、机器学习、数据科学、大数据分析等感兴趣的小伙伴,可加微信交流(邮件中备注,我会附上微信号)
。
基于python的p2p 贷后指标全自动日报制作
- author : sunyaowu
- begin_time : 2018年8月27日
- end_time : 2018年9月7日
- 说明 :贷后指标,包括贷放、逾期、清理等内容。
一 项目规划
- 项目整体框架:
- ①数据获取
- ②数据处理及分析
- ③生成结构化汇总数据,并制作excel单页可视化报告
- ④定时发送邮件,并通过微信搜索向定向群体发送报告
- 周期规划:一周左右
- 预期结果:日报可以上线工作
二 项目代码
- 代码模块化
- 函数式编程思想
☆ 导入python包
Created on Mon Aug 27 11:44:35 2018
@贷后全自动日报设计
@bigin_time:2018-08-27
@ end_time:2018-09-07
@author: SunYaowu
"""
import pandas as pd #结构化数据处理
import numpy as np #矩阵数据处理
import pymysql as pms #sql模块
import time
import datetime
from openpyxl import load_workbook #excel文件读写模块
from email.mime.text import MIMEText #邮件模块
from email.mime.multipart import MIMEMultipart
from email.header import Header
import smtplib #传输协议模块
from wxpy import * #微信
import itchat #微信
☆ 定义类
#################################☆.主类#################################
class AfterLoanData(): #贷后日报class
def __init__(self):
pass
二 数据获取
- python 模块:pyecharts ,mysql数据库提取数据
☆ 数据库query数据模块
#################################☆.功能模块#################################
'''数据库取数模块'''
# 一个传入sql导出数据的函数
def sql_query(self,sql):
config = {
'host':'xxxxxxxx',
'port':xxxx,
'db':'pxxxxxn',
'user':'cxxxxx_xxxxxxx_r',
'password':'Ixxxxxxxexxxxx',
'charset':'utxxxxx4',
'cursorclass':pms.cursors.DictCursor,
}
try:
conn = pms.connect(**config)
# 使用 cursor() 方法创建一个游标对象 cursor
with conn.cursor() as cur:
# 使用 execute() 方法执行 SQL
cur.execute(sql)
conn.commit()
# 获取所需要的数据
data = pd.DataFrame(cur.fetchall())
#关闭连接
cur.colse()
except:
conn.rollback()
finally:
conn.close()
return data
三 数据处理
- python 模块:numpy,pandas,数据规整:筛选、填充、合并
- 进件情况:data_jinjian,data_jinjian_time,data_jinjian_city,data_jinjian_age,data_jinjian_gender
- 回款情况:data_huikuan_time,data_huikuan_city,data_huikuan_age,data_huikuan_gender
- 逾期情况:yuqi_liuru,data_yuqi_time
☆ 其他一些需要多次调用的功能模块
'''获取用户地址模块'''
def user_addr(self):
address_ds=pd.read_pickle(_path + '\code.pkl')
address_ds=address_ds.loc[address_ds['coordinate']=='success',]
address_ds['num']=address_ds['num'].astype(str)
address_dict=dict(zip(address_ds['num'],address_ds['city_final']))#zip的作用是压缩元素级对象,生成新的列表
return address_dict
'''合并字段模块'''
def merge_ds(self,ds_list,on_var,how):
for i in range(len(ds_list)):
if i==1:
tmp =pd.merge(ds_list[0],ds_list[1],on=on_var,how=how)
elif i>1:
tmp =pd.merge(tmp,ds_list[i],on=on_var,how=how)
return tmp
'''excel文件读写模块'''
def get_excel(self,indata,sheetname,row_add,col_add):
# 将数据和字段名写入excel的函数
ds1 = indata.copy()
#加载文件
#激活sheet页
#workbook = load_workbook(_path + '\日报数据.xlsx')
worksheet = workbook.get_sheet_by_name(sheetname)
#将数据循环写入excel的每个单元格中
for row in range(ds1.shape[0]):
for col in range(ds1.shape[1]):
worksheet.cell(row = row + row_add, column = col + col_add).value = ds1.values[row][col]
#返回生成的excel
#workbook.save(_path + '\日报数据.xlsx')
'''字符类型转换模块'''
def char2num(self,indata,varlist):
tmp=indata.copy()
for var in varlist:
tmp[var]=pd.to_numeric(tmp[var], errors='coerce')
return tmp
'''获取时间模块'''
def get_time(self):
#今日
nowtime = datetime.datetime.now()
nowtime_str = nowtime.strftime("%Y-%m-%d")
#昨日
yestime = nowtime + datetime.timedelta(days=-1)
yestime_str = yestime.strftime("%Y-%m-%d")
#月初
firstday = datetime.datetime(nowtime.year,nowtime.month,1,0,0,0)
firstday_str = firstday.strftime("%Y-%m-%d")
#本月
thismonth_str = nowtime.strftime("%Y-%m")
#上月今日
last_month = nowtime.month - 1
last_year = nowtime.year
if last_month == 0:
last_month = 12
last_year -= 1
lastmonthday = datetime.datetime(month=last_month, year=last_year, day=nowtime.day)
lastmonthday_str = lastmonthday.strftime("%Y-%m-%d")
#上月
lastmonth_str = lastmonthday.strftime("%Y-%m")
return nowtime,nowtime_str,yestime,yestime_str,firstday,firstday_str,thismonth_str,lastmonthday,lastmonthday_str,lastmonth_str
四 数据分析
- python 模块:pandas,scipy,分组聚合运算,描述性统计分析。
☆ 进件情况统计模块
#################################1.进件信息#################################
'''获取进件数据'''
def data_jinjian(self):
sql = 'select user_id,amount,real_amount,service_fee,interest,time_limit,state,address,coordinate,again,create_time,loan_time from cl_borrow' #借款信息
try:
data_apply = self.sql_query(sql = sql)
data_apply = data_apply[['user_id','amount','real_amount','service_fee','interest','time_limit','again','state','address','coordinate','create_time','loan_time']]
data_apply['pass'] = np.where(data_apply['loan_time'].notnull() == True,1,0)
except:
print('Oh No,something error!')
return data_apply
'''按进件时间统计1'''
def _data_jinjian_time(self,indata):
_tmp = indata.copy()
tmp_grp1 = _tmp.groupby(['apply_time'])['amount'].sum().reset_index().rename(columns = {
'amount':'申请金额'})
tmp_grp2 = _tmp.groupby(['apply_time'])['user_id'].agg(pd.Series.count).reset_index().rename(columns = {
'user_id':'申请笔数'})
tmp_grp3 = _tmp.loc[_tmp['pass'] == 1,].groupby(['apply_time'])['amount'].sum().reset_index().rename(columns = {
'amount':'放款金额'})
tmp_grp4 = _tmp.loc[_tmp['pass'] == 1,].groupby(['apply_time'])['user_id'].agg(pd.Series.count).reset_index().rename(columns = {
'user_id':'放款笔数'})
tmp = self.merge_ds([tmp_grp1,tmp_grp2,tmp_grp3,tmp_grp4],on_var = 'apply_time',how = 'outer')
tmp['通过率%'] = tmp[['放款笔数','申请笔数']].apply(lambda x: '{x:.2f}'.format(x = x['放款笔数']/x['申请笔数']) if x['申请笔数'] >0 else '0.00',axis=1 )
tmp =self.char2num( indata = tmp , varlist = ['通过率%'] )
_tmp2 = indata.sort_values(by = ['user_id','create_time'] )
_tmp2['贷款次数'] = _tmp2.groupby(['user_id']).cumcount()+1
_tmp2 = _tmp2.loc[_tmp2['贷款次数']>1,]
tmp2_grp1 = _tmp2.groupby(['apply_time'])['amount'].sum().reset_index().rename(columns = {
'amount':'复贷-申请金额'})
tmp2_grp2 = _tmp2.groupby(['apply_time'])['user_id'].agg(pd.Series.count).reset_index().rename(columns = {
'user_id':'复贷-申请笔数'})
tmp2_grp3 = _tmp2.loc[_tmp2['pass']==1,].groupby(['apply_time'])['amount'].sum().reset_index().rename(columns = {
'amount':'复贷-放款金额'})
tmp2_grp4 = _tmp2.loc[_tmp2['pass']==1,].groupby(['apply_time'])['user_id'].agg(pd.Series.count).reset_index().rename(columns = {
'user_id':'复贷-放款笔数'})
tmp2 = self.merge_ds([tmp2_grp1,tmp2_grp2,tmp2_grp3,tmp2_grp4 ],on_var = 'apply_time',how = 'outer')
tmp=pd.merge(tmp,tmp2,on = 'apply_time',how = 'outer')
tmp['复贷-通过率%'] = tmp[['复贷-放款笔数','复贷-申请笔数']].apply(lambda x: x['复贷-放款笔数'] / x['复贷-申请笔数'] if x['复贷-申请笔数'] > 0 else 0,axis = 1 )
tmp['复贷通过占比%'] = tmp[['复贷-放款笔数','放款笔数']].apply(lambda x: x['复贷-放款笔数'] / x['放款笔数'] if x['放款笔数'] > 0 else 0,axis = 1 )
tmp = tmp[['apply_time','申请金额','申请笔数','放款金额','放款笔数','通过率%','复贷-申请金额','复贷-申请笔数','复贷-放款金额','复贷-放款笔数','复贷通过占比%','复贷-通过率%']]
return tmp
'''按进件时间统计2'''
def data_jinjian_time(self,indata):
#日
ds1 = indata.copy()
ds1['apply_time'] = ds1['create_time'].apply(lambda x: x.strftime("%Y-%m-%d") )
tmp_1 = self._data_jinjian_time( indata = ds1 )
#为了格式整齐,防止某一天没有数据,补齐所有理论上的放款时间。
format_day = pd.DataFrame({
'apply_time': list(set([(ds1['create_time'].min() + datetime.timedelta(days = diff)).strftime("%Y-%m-%d") for diff in range( int((nowtime - data_apply['create_time'].min()) / np.timedelta64(1,'D')) )])) })
tmp_1 = pd.merge(tmp_1,format_day,on = 'apply_time',how = 'outer').rename(columns = {
'apply_time':'申请时间'})
tmp_1 = tmp_1.fillna(0).sort_values(['申请时间'])
#周
ds2 = indata.copy()
week_cutday=[((nowtime-datetime.timedelta(days=diff ))).strftime("%Y-%m-%d" ) for diff in range(1,8) if (nowtime-datetime.timedelta(days=diff )).weekday() ==0]
ds2['apply_time'] = ds2['create_time'].apply(lambda x: x.strftime("%Y-%m-%d" ) if x >= pd.to_datetime(week_cutday[0]) else np.nan )
tmp_2 = self._data_jinjian_time( indata = ds2 )
#为了格式整齐,防止某一天没有数据,补齐所有理论上的放款时间。
format_week = pd.DataFrame({
'apply_time': [(pd.to_datetime(week_cutday[0]) + datetime.timedelta(days = diff)).strftime("%Y-%m-%d" ) for diff in range(0,7) if pd.to_datetime(week_cutday[0]) + datetime.timedelta(days=diff) <= nowtime ] })
tmp_2 = pd.merge(tmp_2,format_week,on = 'apply_time',how = 'outer').rename(columns = {
'apply_time':'申请时间'})
tmp_2 = tmp_2.fillna(0).sort_values(['申请时间'])
#生成进件数据,打印到excel,用以制作excel报告
self.get_excel(indata = tmp_2,sheetname = 'jietiao_apply_day',row_add = 9,col_add = 1)
#月
ds3 = indata.copy()
ds3['apply_time'] = ds3['create_time'].apply(lambda x: x.strftime("%Y-%m") )
tmp_3 = self._data_jinjian_time( indata=ds3 )
#为了格式整齐,防止某一天没有数据,补齐所有理论上的放款时间。
format_month = pd.DataFrame({
'apply_time': list(set([(ds2['create_time'].min() + datetime.timedelta(days = diff)).strftime("%Y-%m") for diff in range( int((nowtime - data_apply['create_time'].min()) / np.timedelta64(1,'D')) )])) })
tmp_3 = pd.merge(tmp_3,format_month,on = 'apply_time',how = 'outer').rename(columns = {
'apply_time':'申请时间'})
tmp_3 = tmp_3.fillna(0).sort_values(['申请时间'])
#生成进件数据,打印到excel,用以制作excel报告
self.get_excel(indata = tmp_3,sheetname = 'jietiao_apply_day',row_add = 24,col_add = 2)
return tmp_1,tmp_2,tmp_3
def jinjian_dayReport(self,indata):
#日数据
ds1 = indata.copy()
ds1['apply_time'] = ds1['create_time'].apply(lambda x: x.strftime("%Y-%m-%d" ) if (x.strftime("%Y-%m-%d" ) == nowtime_str) or (x.strftime("%Y-%m-%d" ) == yestime_str) or (x.strftime("%Y-%m-%d" ) == lastmonthday_str) else np.nan )
ds1_grp1 = ds1.groupby(['apply_time'])['user_id'].agg(pd.Series.count).reset_index().rename(columns = {
'user_id':'申请笔数'})
ds1_grp2 = ds1.loc[ds1['pass'] == 1,].groupby(['apply_time'])['user_id'].agg(pd.Series.count).reset_index().rename(columns = {
'user_id':'通过笔数'})
ds1_tmp = pd.DataFrame({
'apply_time':[nowtime_str],
'本月申请笔数':[ds1.loc[(ds1['create_time'].apply(lambda x: x.strftime("%Y-%m")) == thismonth_str) & (ds1['create_time'] <= nowtime),'user_id'].shape[0]],
'本月通过笔数':[ds1.loc[(ds1