代码如下:
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 6 13:41:19 2019
@author: lyl
"""
import numpy as np
import pandas as pd
afile1 = pd.read_csv('20190801.csv') #(93230, 4926)
afile1 = afile1[['NO_CER', 'TIME_REP']]
afile2 = pd.read_csv('20190802.csv') #(33192, 4926)
afile2 = afile2[['NO_CER', 'TIME_REP']]
#时间戳由string转换成date格式
#time.strftime()一次只能转一条,需要apply()
afile1['TIME_REP'] = afile1['TIME_REP'].apply(lambda x: time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(str(x)[:-3]))))
afile1.dtypes
afile2.dtypes
afile2['TIME_REP'] = afile2['TIME_REP'].apply(lambda x: time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(str(x)[:-3]))))
afile = pd.concat([afile1, afile2], axis = 0)
#concat时index有重复出现
afile.reset_index(drop = True, inplace = True)
afile.to_csv('activateIdRepTime.csv', index = False)
afile = pd.read_csv('activateIdRepTime.csv')
#检查激活人数是否重复
len(afile['NO_CER']) == afile.shape[0] #True, 无重复
afile.shape#行数,列数,[0]表行数
#找到重复值
from collections import Counter
Counter(list(afile['NO_CER']))
atime_rep = dict(Counter(list(afile['NO_CER'])))
print([key for key, value in atime_rep.items() if value > 1]) #[] 空值
print({key : value for key, value in atime_rep.items() if value > 1})#[]空值
#获取按时间排序后的索引顺序
index = afile.sort_values(by = ['TIME_REP'], ascending = True)
#查看时间最大最小值
np.array([np.max(afile['TIME_REP'])]).astype('M8[D]') #array(['2019-01-01'], dtype='datetime64[D]')
np.array([np.min(afile['TIME_REP'])]).astype('M8[D]') #array(['2018-06-15'], dtype='datetime64[D]')
#与提数匹配核对
repay = pd.read_csv('repay.csv', encoding = 'ansi')
repay.rename(columns = {'ID_NO_MD5': '身份证'}, inplace = True)
repay = repay.sort_values(by = ['贷款申请日期', '身份证', '借据号', '期号'], ascending = [True, True, True, True])
np.max(repay['贷款申请日期'].astype('M8[D]')) #Timestamp('2018-12-31 00:00:00')
np.min(repay['贷款申请日期'].astype('M8[D]')) #Timestamp('2018-07-01 00:00:00')
#检查两个数据集身份证号数量是否相同
#repay[repay['期号'] == 0]['ID']
len(set(afile['NO_CER']) - set(repay[repay['期号'] == 0]['身份证'])) #0 数量相同
len(set(repay[repay['期号'] == 0]['身份证']) - set(afile['NO_CER'])) #有3727没有匹配上,还款数据中有些人没有PBOC数据
repaydate = np.array(repay['最后还款日'].fillna('2019-08-06')).astype('M8[D]')
paydate = np.array(repay['到期日']).astype('M8[D]')
pddays = (repaydate - paydate).astype(int)
#np.where嵌套使用判断多重结果
pddays = np.where(pddays < 0, 0, np.where(repay['期号'] == 0, 0, pddays))
repay['checkcpd'] = pddays
repay[['期号', '到期日', '最后还款日', 'checkcpd']]
#检查当前逾期天数是否匹配
#np.sum(bool值为True)
np.sum(repay['当前逾期天数'] != repay['checkcpd']) #0
#检查最后逾期表
arepay = repay.sort_values(by = ['checkcpd'], ascending = [False])
arepay = arepay.drop_duplicates(['身份证'], keep = 'first')
arepay.reset_index(drop = True, inplace = True)
#历史最大逾期天数 缺失值补0
arepay['历史最大逾期天数'].fillna(0, inplace = True)
np.sum(arepay['checkcpd'] != arepay['历史最大逾期天数'])#509个历史最大逾期天数与个人最大逾期天数不同
arepay[arepay['checkcpd'] != arepay['历史最大逾期天数']]#判断为True的显示出来
pd.DataFrame(arepay[arepay['checkcpd'] != arepay['历史最大逾期天数']]['身份证']).to_csv('问题身份证号.csv', index = False)
#合并只有左表,无右表?
finaltable = afile.merge(arepay[['身份证', '贷款申请日期', 'checkcpd', '当前逾期天数', '历史最大逾期天数']],
left_on = 'NO_CER', right_on = '身份证', how = 'inner')
checkrepdate = (np.array(finaltable['TIME_REP']).astype('M8[D]') - np.array(finaltable['贷款申请日期']).astype('M8[D]')).astype(int)
np.min(checkrepdate) #-30
np.max(checkrepdate) #1
np.sum(np.where((checkrepdate < -30) | (checkrepdate > 1), 1, 0)) #0, 没有异常