"""Prepare the credit-card dataset and build a balanced, undersampled subset.

Steps performed at import/run time:
1. Load ``creditcard.csv`` and plot the number of samples per ``Class`` label.
2. Standardize ``Amount`` into a new ``normAmount`` column, then drop the
   ``Amount`` and ``Time`` columns.
3. Split the full data into features ``X`` and label ``y``.
4. Undersample: keep every ``Class == 1`` row plus an equally sized random
   draw (without replacement) of ``Class == 0`` rows, exposed as
   ``X_undersample`` / ``y_undersample``.
"""
import itertools

import matplotlib.pyplot as plt
import missingno
import numpy as np
import pandas as pda
from sklearn.preprocessing import StandardScaler

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Very old scikit-learn releases only ship the legacy module, which was
    # removed in scikit-learn 0.20.
    from sklearn.cross_validation import train_test_split

data = pda.read_csv("creditcard.csv")
# print(data.head())

# Inspect the label distribution: the classes are heavily imbalanced.
# Series.value_counts() replaces the deprecated top-level pandas.value_counts().
count_class = data.Class.value_counts()
# print(count_class)

# Optional check for missing values (none were found in this dataset):
# missingno.bar(data, labels=True, color='b')
# plt.show()

# Plot the number of samples for each class label.
count_class.plot(kind="bar")
plt.xlabel("kind")
plt.ylabel("count")
# plt.show()

# The Amount values are large compared with the other features, so
# standardize them. StandardScaler expects a 2-D array, hence the reshape.
amount = data.Amount.values
amount = StandardScaler().fit_transform(amount.reshape(-1, 1))
data["normAmount"] = amount

# Drop the columns that are no longer needed.
data = data.drop(["Amount", "Time"], axis=1)
# print(data.head())

# Imbalance handling, method 1 -- undersampling: take as many samples from
# the larger class as there are samples in the smaller class.
# ``.loc`` with a boolean column mask replaces ``.ix``, which was removed in
# pandas 1.0; both X and y stay DataFrames as before.
X = data.loc[:, data.columns != "Class"]
y = data.loc[:, data.columns == "Class"]

oneindex = data[data.Class == 1].index
zeroindex = data[data.Class == 0].index

# Randomly pick as many label-0 indices as there are label-1 indices.
# NOTE: no random seed is set, so the selection differs between runs.
selzeroindex = np.random.choice(zeroindex, len(oneindex), replace=False)
totalindex = np.concatenate([oneindex, selzeroindex])
undersample = data.loc[totalindex]
# print(undersample.Class.value_counts())

X_undersample = undersample.loc[:, undersample.columns != "Class"]
y_undersample = undersample.loc[:, undersample.columns == "Class"]
# print(X_undersample)
# print(y_undersample)