# Program for processing retained bank (loan) data.
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 03 22:21:37 2017
@author: Administrator
"""
import pandas as pd
import numpy as np
from numpy import *
import datetime
import matplotlib.pyplot as plt
# Load the loan table from disk; row 0 of the CSV supplies the column names.
data = pd.read_csv('loan.csv', header=0)

# The whole table is analysed by default. To experiment on a slice instead,
# replace the assignment below with e.g. ``sampledata = data.loc[0:700000]``.
sampledata = data

# Initial parameters for the simulation: pull out the columns of interest.
# The targets on the left line up one-to-one with the column names on the right.
(
    interest_rate,
    debt_ability,
    user_grade,
    user_subgrade,
    loan_term,
    funded_amount,
) = (
    sampledata[column_name]
    for column_name in (
        'int_rate',
        'dti',
        'grade',
        'sub_grade',
        'term',
        'funded_amnt',
    )
)
# Disabled analysis, kept as a bare string literal so it never executes:
# computes the mean 'annual_inc' for every sub-grade A1..G5 (after dropping the
# rows whose 'annual_inc' value is a str) and plots the 35 means as a blue line.
# NOTE(review): the text below uses Python 2 ``print`` statements and, as stored
# here, its loop bodies carry no indentation, so it cannot be re-enabled verbatim.
'''
## 看看用户对自报月收入的评估
user_grade_annual_inc=sampledata[['sub_grade','annual_inc']]
##对自报月收入的异常数据的处理
str_annual_true=[]
k=0
for i in range(user_grade_annual_inc.shape[0]):
print i
str_annual=user_grade_annual_inc.iloc[i,1]
if isinstance(str_annual, str)==1:
str_annual_true.append(i)
#删除掉那些异样的数据
user_grade_annual_inc=user_grade_annual_inc.drop(str_annual_true)
alphabet=['A','B','C','D','E','F','G']
user_grade_average_annual_inc=zeros([len(alphabet)*5,1])
## 循环grade级别的用户
z=0
for i in range(len(alphabet)):
print i
#i=1
# 循环subgrade级别的用户
for j in range(5):
#j=1
alphabet_grade_subgrade=alphabet[i]+str(j+1)
type_index=user_grade_annual_inc['sub_grade'] == alphabet_grade_subgrade
user_grade_annual_inc_type=user_grade_annual_inc[type_index]
user_grade_annual_inc_type=user_grade_annual_inc_type.drop('sub_grade',1)
#用矩阵的形式展现
user_grade_average_annual_inc[z,0]=user_grade_annual_inc_type['annual_inc'].mean()
z=z+1
# 用户等级跟它自己的评估能力关系
plt.xlabel('subgrade')
plt.ylabel('annualinc')
plt.plot(user_grade_average_annual_inc,color="blue", linewidth=2.5, linestyle="-", label="ability")
'''
# Disabled analysis, kept as a bare string literal so it never executes:
# for every sub-grade A1..G5 it computes the mean 'dti' and the mean
# 'delinq_2yrs' (rows whose 'delinq_2yrs' value is a str are dropped first).
# It then plots 1 / mean(dti) in blue and the mean 'delinq_2yrs' in red.
# The per-sub-grade mean 'dti' is stored twice: in a dict keyed by the
# sub-grade label and in a (35, 1) array.
# NOTE(review): the text below uses a Python 2 ``print`` statement and, as stored
# here, its loop bodies carry no indentation, so it cannot be re-enabled verbatim.
'''
## 看看用户对于自己的借款能力的评估(只针对A,B,C....等级)
user_grade_ability=sampledata[['grade','sub_grade','dti']]
user_delay_payment=sampledata[['sub_grade','delinq_2yrs']]
##对延迟还款次数的异常数据的处理
str_delay_true=[]
str_delay_index=zeros([user_delay_payment.shape[0],1])
k=0
for i in range(user_delay_payment.shape[0]):
str_delay=user_delay_payment.iloc[i,1]
if isinstance(str_delay, str)==1:
str_delay_true.append(i)
str_delay_index[i,0]=isinstance(str_delay, str)
#删除掉那些异样的数据
user_delay_payment=user_delay_payment.drop(str_delay_true)
alphabet=['A','B','C','D','E','F','G']
user_grade_average_value={}
user_grade_average_value1=zeros([len(alphabet)*5,1])
user_delay_payment_average_value=zeros([len(alphabet)*5,1])
#user_grade_average_value2=[len(alphabet)*4]
## 循环grade级别的用户
z=0
for i in range(len(alphabet)):
print i
#i=1
# 循环subgrade级别的用户
for j in range(5):
#j=1
alphabet_grade_subgrade=alphabet[i]+str(j+1)
type_index= user_grade_ability['sub_grade'] == alphabet_grade_subgrade
user_grade_ability_type=user_grade_ability[type_index]
user_delay_payment_type=user_delay_payment[type_index]
user_delay_payment_type=user_delay_payment_type.drop('sub_grade',1)
user_grade_ability_type=user_grade_ability_type.drop('grade',1)
user_grade_ability_type=user_grade_ability_type.drop('sub_grade',1)
#这是用词典的形式展示的
user_grade_average_value[alphabet_grade_subgrade]=user_grade_ability_type['dti'].mean() #均值
#用矩阵的形式展现
user_grade_average_value1[z,0]=user_grade_ability_type['dti'].mean()
#对于用户的延迟还款的次数
user_delay_payment_average_value[z,0]=user_delay_payment_type['delinq_2yrs'].mean()
z=z+1
# 用户等级跟它自己的评估能力关系
plt.xlabel('subgrade')
plt.ylabel('ability')
plt.plot(1/user_grade_average_value1,color="blue", linewidth=2.5, linestyle="-", label="ability")
#不良记录
plt.xlabel('subgrade')
plt.ylabel('delay payment')
plt.plot(user_delay_payment_average_value,color="red", linewidth=2.5, linestyle="-")
'''
# Disabled analysis, kept as a bare string literal so it never executes:
# takes the ('sub_grade', 'dti') columns, sorts by 'sub_grade', keeps a single
# row per sub-grade via drop_duplicates, re-indexes the result by 'sub_grade'
# and draws it as a bar chart. Note that this plots one sample 'dti' value per
# sub-grade, not an average.
'''
## 看看用户对于自己的借款能力的评估
user_subgrade_ability=sampledata[['sub_grade','dti']]
user_subgrade_ability=user_subgrade_ability.sort_values(by=['sub_grade'])
user_subgrade_ability=user_subgrade_ability.drop_duplicates(['sub_grade'])
user_subgrade_ability=user_subgrade_ability.reset_index()
user_subgrade_ability=user_subgrade_ability.drop('index',1)
# 重新构建index
user_subgrade_ability=user_subgrade_ability.set_index('sub_grade')
user_subgrade_ability.plot(kind='bar')
'''
## Check whether a single user's grade changes over time
#one_user_data=sampledata[sampledata.id==1060578]
# Disabled snippet, kept as a bare string literal so it never executes:
# counts the rows per 'member_id' value and builds a four-column view
# ('sub_grade', 'loan_amnt', 'term', 'int_rate') of the table.
'''
user_number=sampledata['member_id'].value_counts()#统计每个等级的个数
user_subgrade_amount_term_rate=sampledata[['sub_grade','loan_amnt','term','int_rate']]
'''
## Look at how the interest rate relates to the amount of money
# Disabled analysis, kept as a bare string literal so it never executes:
# keeps a single ('sub_grade', 'int_rate') row per sub-grade via drop_duplicates,
# sorts by 'sub_grade', re-indexes by 'sub_grade' and line-plots the rates.
# The author's conclusion recorded inside the string: the lower the grade,
# the higher the interest rate.
'''
## 等级越来越低的时候,借贷的利率如何变化
##结论就是等级越低利率就越高
user_subgrade_rate=sampledata[['sub_grade','int_rate']]
user_subgrade_rate=user_subgrade_rate.drop_duplicates(['sub_grade'])
user_subgrade_rate=user_subgrade_rate.sort_values(by=['sub_grade'])
user_subgrade_rate=user_subgrade_rate.reset_index()
user_subgrade_rate=user_subgrade_rate.drop('index',1)
#user_subgrade_rate=user_subgrade_rate.drop('sub_grade',1)
user_subgrade_rate=user_subgrade_rate.set_index('sub_grade')
user_subgrade_rate.plot()
'''
# Disabled analysis, kept as a bare string literal so it never executes:
# counts the rows per 'sub_grade' value and per 'int_rate' value, renames the
# resulting columns, places the two count tables side by side (axis=1) and
# sorts the combined table by 'subgrade'.
# NOTE(review): it reads user_subgrade_amount_term_rate, which in this file is
# only assigned inside another disabled string, so that snippet would have to
# be re-enabled first.
'''
#user_subgrade_amount_term_rate=user_subgrade_amount_term_rate.sort_values(by=['sub_grade'])
user_subgrade_number=user_subgrade_amount_term_rate['sub_grade'].value_counts()#统计每个子等级的个数
user_subgrade_number=user_subgrade_number.reset_index()
user_subgrade_number.rename(columns={'index':'subgrade'}, inplace = True)
user_subgrade_number.rename(columns={'sub_grade':'subgrade_number'}, inplace = True)
user_rate_number=user_subgrade_amount_term_rate['int_rate'].value_counts()#统计每个利率的个数
user_rate_number=user_rate_number.reset_index()
user_rate_number.rename(columns={'index':'rate'}, inplace = True)
user_rate_number.rename(columns={'int_rate':'int_rate_number'}, inplace = True)
user_subgrade_rate = pd.concat([user_subgrade_number,user_rate_number], axis=1)
user_subgrade_rate=user_subgrade_rate.sort_values(by=['subgrade'])
'''
# Disabled analysis, kept as a bare string literal so it never executes:
# counts the rows per sub-grade (from user_subgrade), renames the columns to
# 'subgrade' / 'number', sorts by the sub-grade label, re-indexes by 'subgrade'
# and draws the counts as a bar chart. The author's conclusion recorded inside
# the string: the counts follow some distribution.
# NOTE(review): the variable is spelled ``sbugrade_number`` (sic) from its second
# line onward; the spelling is used consistently, so it is a naming typo only.
'''
#统计下子等级的个数
## 得到的结论就是服从某个分布
subgrade_number=user_subgrade.value_counts()#统计每个等级的个数
sbugrade_number=subgrade_number.reset_index()
# 修改列的名字
sbugrade_number.rename(columns={'sub_grade':'number'}, inplace = True)
sbugrade_number.rename(columns={'index':'subgrade'}, inplace = True)
#把等级字母按顺序排序
sbugrade_number=sbugrade_number.sort_values(by=['subgrade'])
sbugrade_number=sbugrade_number.reset_index()
sbugrade_number=sbugrade_number.drop('index',1)
sbugrade_number=sbugrade_number.set_index('subgrade')
#sbugrade_number2=sbugrade_number1[['subgrade','number']]
sbugrade_number.plot(kind='bar')
'''
# Count how many rows fall into each grade and draw the counts as a bar chart.
#
# The result, grade_number, is a one-column DataFrame ('number') indexed by the
# grade label ('grade') in ascending label order.
#
# Why not reset_index() + rename() + sort_values() + set_index(): that round trip
# depends on the column labels value_counts() happens to produce. Older pandas
# yields 'index' and the Series name ('grade'); pandas >= 2.0 yields 'grade' and
# 'count', so renaming 'grade' -> 'number' removes the very column the later
# sort_values(by=['grade']) needs and the script dies with a KeyError.
# Sorting the counts by their index and naming things explicitly gives the same
# table on every pandas version.
grade_number = user_grade.value_counts().sort_index().to_frame('number')
grade_number.index.name = 'grade'
grade_number.plot(kind='bar')