import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
# Load the raw table; the file is read with the GB18030 codec.
data = pd.read_csv('data.csv', encoding='gb18030')

# Remove the 'id_name' column, then whichever three columns are
# left-most once it is gone.
data.drop(columns='id_name', inplace=True)
data.drop(columns=data.columns[[0, 1, 2]], inplace=True)

# Remove every column whose values collapse to a single distinct element;
# such a column cannot separate one row from another.
single_valued = [name for name in data.columns if len(set(data[name])) == 1]
data.drop(columns=single_valued, inplace=True)
# Replace each distinct 'reg_preference_for_trad' value with an integer code.
# NOTE(review): per the pandas docs, factorize codes missing values as -1,
# which would wrap to 65535 under uint16 -- confirm whether this column can
# contain NaN and, if so, whether that code is intended.
trad_codes = pd.factorize(data['reg_preference_for_trad'])[0]
data['reg_preference_for_trad'] = trad_codes.astype(np.uint16)
# Treat a missing 'student_feature' as 0.
# Done as one column-level assignment instead of `data[col][i] = 0` per row:
# chained indexing may write to a temporary copy (and never reaches `data`
# when pandas Copy-on-Write is active), and the previous loop only covered a
# hard-coded 4754 rows.
data['student_feature'] = data['student_feature'].fillna(0)
# Reference instant against which the "days elapsed" features are measured.
nowtime = datetime.strptime('2019-08-05 00:00:00', '%Y-%m-%d %H:%M:%S')

# Columns holding 'YYYY-MM-DD' date strings.
data_clo_name = ['latest_query_time', 'loans_latest_time']

# Convert each date string into the whole number of days between it and
# `nowtime`; a missing date becomes 0. The result is stored as float64.
# The conversion is vectorised so it covers every row of the frame (the
# previous loop assumed exactly 4754 rows) and assigns the whole column at
# once, avoiding chained `data[col][i] = ...` writes that pandas may apply
# to a temporary copy.
for date_col in data_clo_name:
    parsed_dates = pd.to_datetime(data[date_col], format='%Y-%m-%d')
    elapsed_days = (nowtime - parsed_dates).dt.days  # NaN where the date is missing
    data[date_col] = elapsed_days.fillna(0).astype(np.float64)
# 'first_transaction_time' stores dates as numbers shaped like YYYYMMDD.
# Convert every present value into the whole number of days between that
# date and `nowtime` (the reference instant defined earlier in the script).
# The work is vectorised and written back through `.loc`, so it covers
# every row (the previous loop assumed exactly 4754) and avoids chained
# `data[col][i] = ...` writes that pandas may apply to a temporary copy.
first_txn = data['first_transaction_time']
has_date = first_txn.notna()
txn_dates = pd.to_datetime(
    first_txn[has_date].astype(np.int64).astype(str),  # e.g. 20130817.0 -> '20130817'
    format='%Y%m%d',
)
data.loc[has_date, 'first_transaction_time'] = (nowtime - txn_dates).dt.days

# Every value still missing anywhere in the frame -- including a missing
# 'first_transaction_time' -- becomes 0.
data.fillna(0, inplace=True)
# One figure per column: the raw values in the upper panel, a histogram of
# the same values in the lower panel.
for i, col_name in enumerate(data.columns):
    plt.figure(i)
    series_axes = plt.subplot(2, 1, 1)
    series_axes.plot(data[col_name])
    hist_axes = plt.subplot(2, 1, 2)
    hist_axes.hist(data[col_name])
    # Label the lower panel with the column name (left-aligned, y=0.5).
    hist_axes.set_title(col_name, y=0.5, loc='left')
# NOTE(review): the file's indentation was lost, so it is unclear whether
# this call originally ran once per column or once after the loop. It is
# placed after the loop because each figure is given its own number --
# confirm against the intended behaviour.
plt.show()

# Pairwise plot of every column against every other column.
dfdate = pd.DataFrame(data, columns=data.columns)
sns.pairplot(dfdate)
plt.tight_layout()
plt.show()