数据集来自 UCI 机器学习库(http://archive.ics.uci.edu/ml/datasets/Bank+Marketing)。下载得到的 CSV 以分号(;)作为分隔符,用默认参数读取时每一行都会挤在同一列里,所以下面先花了一些时间把它整理成标准的逗号分隔格式。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime as dt
from sklearn import preprocessing #数据预处理
from sklearn.linear_model import LogisticRegression #导入逻辑回归库
from sklearn.model_selection import train_test_split #用来划分测试集与训练集
import seaborn as sns #画图库
import tensorflow as tf
# Apply the seaborn theme first: sns.set() rewrites matplotlib's rcParams
# (font sizes included), so a plt.rc() override issued before it is lost.
# One sns.set() call is enough; a second call simply replaces the first.
sns.set(style="whitegrid", color_codes=True)
plt.rc("font", size=14)
# Load the raw UCI export and recover the real column names.
# NOTE(review): the recorded shape is (41188, 1) - every record lands in one
# column because the file is ';'-delimited while read_csv splits on ','.
# The cells below rely on that single-column layout and re-split it by hand.
raw_csv_path = 'D:\\pycode\\data\\bank-additional\\bank-additional-full.csv'
data = pd.read_csv(raw_csv_path, header=0)
data = data.dropna()  # discard rows that contain NaN
print(data.shape)
col_name = data.columns.tolist()
print(col_name)
# Glue the header label(s) into one string, then split it on ';'.
col1 = ''.join(col_name)
print(col1)
col2 = col1.split(';')
print(col2)
# Continue with a plain ndarray of the raw record strings.
data = data.values
(41188, 1)
['age;"job";"marital";"education";"default";"housing";"loan";"contact";"month";"day_of_week";"duration";"campaign";"pdays";"previous";"poutcome";"emp.var.rate";"cons.price.idx";"cons.conf.idx";"euribor3m";"nr.employed";"y"']
age;"job";"marital";"education";"default";"housing";"loan";"contact";"month";"day_of_week";"duration";"campaign";"pdays";"previous";"poutcome";"emp.var.rate";"cons.price.idx";"cons.conf.idx";"euribor3m";"nr.employed";"y"
['age', '"job"', '"marital"', '"education"', '"default"', '"housing"', '"loan"', '"contact"', '"month"', '"day_of_week"', '"duration"', '"campaign"', '"pdays"', '"previous"', '"poutcome"', '"emp.var.rate"', '"cons.price.idx"', '"cons.conf.idx"', '"euribor3m"', '"nr.employed"', '"y"']
# Build the cleaned list of column names from the split header tokens.
# The first token is the only one without surrounding quotes, so it is
# wrapped in quotes to let the generic "drop first/last character" step work.
col2[0] = '"age"'
lis = []
for token in col2:
    if token[0] not in '-0123456789':
        # Text token: remove the surrounding quote characters.
        lis.append(token[1:-1])
        continue
    # Numeric-looking token: a '.' means float, otherwise int.
    to_number = float if '.' in token else int
    lis.append(to_number(token))
print(lis)
['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y']
#print(data[0])
#str = ''.join(data[0]) #array转换成字符串
#print(str)
#tmp = str.split(';')
#print(tmp)
#print(len(tmp))
#print((tmp[0][0]>'0' and tmp[0][0]<'9'))
#print(type(tmp[1]))
#li = []
#for i in range(len(tmp)):
# if (tmp[i][0] >= '0' and tmp[i][0] <= '9') or tmp[i][0]=='-':
# if '.' in tmp[i]:
# li.append(float(tmp[i]))
# else:
# li.append(int(tmp[i]))
# else:
# li.append(tmp[i][1:-1])
#print(li)
[ '56;"housemaid";"married";"basic.4y";"no";"no";"no";"telephone";"may";"mon";261;1;999;0;"nonexistent";1.1;93.994;-36.4;4.857;5191;"no"']
56;"housemaid";"married";"basic.4y";"no";"no";"no";"telephone";"may";"mon";261;1;999;0;"nonexistent";1.1;93.994;-36.4;4.857;5191;"no"
['56', '"housemaid"', '"married"', '"basic.4y"', '"no"', '"no"', '"no"', '"telephone"', '"may"', '"mon"', '261', '1', '999', '0', '"nonexistent"', '1.1', '93.994', '-36.4', '4.857', '5191', '"no"']
21
True
<class 'str'>
[56, 'housemaid', 'married', 'basic.4y', 'no', 'no', 'no', 'telephone', 'may', 'mon', 261, 1, 999, 0, 'nonexistent', 1.1, 93.994, -36.4, 4.857, 5191, 'no']
def _parse_field(token):
    """Convert one raw ';'-separated field into int, float or plain text.

    A field starting with a digit or '-' is treated as a number (float when
    it contains '.', int otherwise). Anything else, including an empty
    field, is returned as text with surrounding double quotes removed.
    """
    if token and token[0] in '-0123456789':
        return float(token) if '.' in token else int(token)
    return token.strip('"')


# Parse every raw record into a list of typed values (one inner list per row).
# The record string is bound to `line`, not `str`, so the builtin stays usable.
li = []
for row in data:
    line = ''.join(row)  # the row holds the record's text; join it into one string
    li.append([_parse_field(token) for token in line.split(';')])
import csv

# Write the parsed header and rows as a regular comma-separated file.
# newline='' lets the csv module control line endings itself, as its
# documentation requires; without it an extra blank line can appear after
# every row on platforms that translate '\n' to '\r\n'.
with open("D:\\pycode\\data\\bank-additional\\mybank.csv", "w", newline='') as csvfile:
    writer = csv.writer(csvfile)
    # Column names go first ...
    writer.writerow(lis)
    # ... followed by all data rows in one call.
    writer.writerows(li)
# Reload the cleaned file; it now parses into proper columns.
clean_csv_path = 'D:\\pycode\\data\\bank-additional\\mybank.csv'
dataset = pd.read_csv(clean_csv_path, header=0).dropna()
print(dataset.shape)
print(dataset.columns.tolist())
(41188, 21)
['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y']
dataset['education'].unique()  # distinct values before merging
# Collapse the three "basic.*" schooling levels into one 'Basic' category.
for basic_level in ('basic.9y', 'basic.6y', 'basic.4y'):
    is_level = dataset['education'] == basic_level
    dataset['education'] = np.where(is_level, 'Basic', dataset['education'])
dataset['education'].unique()
array(['Basic', 'high.school', 'professional.course', 'unknown',
'university.degree', 'illiterate'], dtype=object)
# Encode the target: 'no' -> 0, 'yes' -> 1.
dataset['y'] = np.where(dataset['y'] == 'no', 0, dataset['y'])
dataset['y'] = np.where(dataset['y'] == 'yes', 1, dataset['y'])
# np.where mixes the ints with the remaining strings, leaving the column as
# generic Python objects. Cast it to a real integer dtype so later steps get
# numeric labels; a label other than 'no'/'yes' makes the cast raise rather
# than staying in the column unnoticed.
dataset['y'] = dataset['y'].astype('int64')
dataset['y'].value_counts()
0 36548
1 4640
Name: y, dtype: int64
# Bar chart of the label counts.
sns.countplot(x='y', data=dataset, palette='hls')
# Save before show(): once show() has displayed the figure it may already be
# released, and a later savefig() would then write a blank image.
plt.savefig('count_plot')
plt.show()
![count plot of y](output_10_0.png)
# Subscription statistics: share of customers with y == 0 and with y == 1.
count_no_sub = int((dataset['y'] == 0).sum())
count_sub = int((dataset['y'] == 1).sum())
labelled_total = count_no_sub + count_sub
pct_of_no_sub = count_no_sub / labelled_total
pct_of_sub = count_sub / labelled_total
print('未开户的百分比: %.2f%%' % (pct_of_no_sub*100))
print('开户的百分比: %.2f%%' % (pct_of_sub*100))
# Compare the per-class (y = 0 / 1) means of the numeric features.
# numeric_only=True keeps text columns out of the aggregation explicitly;
# recent pandas releases raise a TypeError rather than silently skipping them.
dataset.groupby('y').mean(numeric_only=True)
age | duration | campaign | pdays | previous | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | ... | month_oct | month_sep | day_of_week_fri | day_of_week_mon | day_of_week_thu | day_of_week_tue | day_of_week_wed | poutcome_failure | poutcome_nonexistent | poutcome_success | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
y | |||||||||||||||||||||
0 | 39.911185 | 220.844807 | 2.633085 | 984.113878 | 0.132374 | 0.248875 | 93.603757 | -40.593097 | 3.811491 | 5176.166600 | ... | 0.011027 | 0.008591 | 0.191009 | 0.209779 | 0.207344 | 0.195277 | 0.196591 | 0.099787 | 0.887107 | 0.013106 |
1 | 40.913147 | 553.191164 | 2.051724 | 792.035560 | 0.492672 | -1.233448 | 93.354386 | -39.789784 | 2.123135 | 5095.115991 | ... | 0.067888 | 0.055172 | 0.182328 | 0.182543 | 0.225216 | 0.205388 | 0.204526 | 0.130388 | 0.676940 | 0.192672 |
2 rows × 61 columns
#计算其他特征值(如教育和婚姻状况)的分布
#data.groupby('job').mean()
#data.groupby('marital').mean()
#data.groupby('education').mean()
%matplotlib inline
# Stacked bar chart: share of each outcome (y) within every job category.
table = pd.crosstab(dataset['job'], dataset['y'])
row_totals = table.sum(axis=1).astype(float)
job_shares = table.div(row_totals, axis=0)
job_shares.plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of Job title vs Purchase')
plt.xlabel('Job')
plt.ylabel('Proportion of Purchase')
plt.savefig('purchase_vs_job')
# Purchase frequency differs across job titles, so job title can be a good predictor.
![stacked bar chart of job title vs purchase](output_14_0.png)
# Stacked bar chart: share of each outcome (y) within every marital status.
table = pd.crosstab(dataset['marital'], dataset['y'])
per_status_total = table.sum(axis=1).astype(float)
ax = table.div(per_status_total, axis=0).plot(kind='bar', stacked=True)
ax.set_title('Stacked Bar Chart of Marital Status vs Purchase')
ax.set_xlabel('Marital Status')
ax.set_ylabel('Proportion of Customers')
plt.savefig('mariral_vs_pur_stack')
# Marital status does not look like a good predictor.
![stacked bar chart of marital status vs purchase](output_15_0.png)
# Next: the education attribute.
table = pd.crosstab(dataset['education'], dataset['y'])
per_level_total = table.sum(axis=1).astype(float)
education_shares = table.div(per_level_total, axis=0)
education_shares.plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of Education vs Purchase')
plt.xlabel('Education')
plt.ylabel('Proportion of Customers')
plt.savefig('edu_vs_pur_stack')
# Education appears to be a good predictor of the outcome variable.
![stacked bar chart of education vs purchase](output_16_0.png)
# Next: the time feature (day of the week of the contact).
table = pd.crosstab(dataset['day_of_week'], dataset['y'])
per_day_total = table.sum(axis=1).astype(float)
ax = table.div(per_day_total, axis=0).plot(kind='bar', stacked=True)
ax.set_title('Stacked Bar Chart of Day of Week vs Purchase')
ax.set_xlabel('Day of Week')
ax.set_ylabel('Proportion of Purchase')
plt.savefig('dow_vs_purchase')
# Day of the week is not a good predictor of the outcome.
![stacked bar chart of day of week vs purchase](output_17_0.png)
# Categorical columns that must be turned into indicator (one-hot) columns.
cat_vars = ['job', 'marital', 'education', 'default', 'housing', 'loan',
            'contact', 'month', 'day_of_week', 'poutcome']
print(dataset.shape)
print(dataset[cat_vars].head())
# One call encodes every listed column: the untouched columns come first,
# then the indicator columns in cat_vars order, each named
# "<column>_<value>". `dataset` itself is left unchanged.
data_final = pd.get_dummies(dataset, columns=cat_vars)
print(data_final.columns)
print(data_final.shape)
(41188, 21)
job marital education default housing loan contact month \
0 housemaid married Basic no no no telephone may
1 services married high.school unknown no no telephone may
2 services married high.school no yes no telephone may
3 admin. married Basic no no no telephone may
4 services married high.school no no yes telephone may
day_of_week poutcome
0 mon nonexistent
1 mon nonexistent
2 mon nonexistent
3 mon nonexistent
4 mon nonexistent
Index(['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate',
'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y',
'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
'job_management', 'job_retired', 'job_self-employed', 'job_services',
'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
'marital_divorced', 'marital_married', 'marital_single',
'marital_unknown', 'education_Basic', 'education_high.school',
'education_illiterate', 'education_professional.course',
'education_university.degree', 'education_unknown', 'default_no',
'default_unknown', 'default_yes', 'housing_no', 'housing_unknown',
'housing_yes', 'loan_no', 'loan_unknown', 'loan_yes',
'contact_cellular', 'contact_telephone', 'month_apr', 'month_aug',
'month_dec', 'month_jul', 'month_jun', 'month_mar', 'month_may',
'month_nov', 'month_oct', 'month_sep', 'day_of_week_fri',
'day_of_week_mon', 'day_of_week_thu', 'day_of_week_tue',
'day_of_week_wed', 'poutcome_failure', 'poutcome_nonexistent',
'poutcome_success'],
dtype='object')
(41188, 62)
# Sanity check: show any rows whose label is the string 'unknown'.
data_final.loc[data_final['y'] == 'unknown']
age | duration | campaign | pdays | previous | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | ... | month_oct | month_sep | day_of_week_fri | day_of_week_mon | day_of_week_thu | day_of_week_tue | day_of_week_wed | poutcome_failure | poutcome_nonexistent | poutcome_success |
---|
0 rows × 62 columns
from imblearn.over_sampling import SMOTE

# Split features and label.
X = data_final.loc[:, data_final.columns != 'y']
y = data_final.loc[:, data_final.columns == 'y'].values.ravel()
print(X.shape)
print(y.shape)

os = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
columns = X_train.columns
# Oversample the training split only, so the test split keeps its original
# class balance. fit_resample() is imbalanced-learn's resampling method;
# the older fit_sample() alias is no longer available in later releases.
os_data_X, os_data_y = os.fit_resample(X_train, y_train.astype('int'))
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['y'])

# Check the class balance after oversampling.
resampled_total = len(os_data_X)
resampled_no_sub = len(os_data_y[os_data_y['y'] == 0])
resampled_sub = len(os_data_y[os_data_y['y'] == 1])
print("过采样以后的数据量: ", resampled_total)
print("未开户的用户数量: ", resampled_no_sub)
print("开户的用户数量: ", resampled_sub)
print("未开户的用户数量的百分比: ", resampled_no_sub / resampled_total)
print("开户的用户数量的百分比: ", resampled_sub / resampled_total)
(41188, 61)
(41188,)
过采样以后的数据量: 51158
未开户的用户数量: 25579
开户的用户数量: 25579
未开户的用户数量的百分比: 0.5
开户的用户数量的百分比: 0.5
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

# Fit on the oversampled training data, then evaluate on the test split.
logreg = LogisticRegression(solver='liblinear')
train_labels = os_data_y.values.ravel()
logreg.fit(os_data_X, train_labels)
y_pred = logreg.predict(X_test)
test_accuracy = logreg.score(X_test, y_test.astype('int'))
print('在测试数据集上面的预测准确率: {:.2f}'.format(test_accuracy))
在测试数据集上面的预测准确率: 0.87
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the test-set predictions.
report_text = classification_report(y_test.astype('int'), y_pred)
print(report_text)
precision recall f1-score support
0 0.98 0.86 0.92 10969
1 0.45 0.89 0.60 1388
micro avg 0.87 0.87 0.87 12357
macro avg 0.72 0.88 0.76 12357
weighted avg 0.92 0.87 0.88 12357
# Final result: ROC curve of the fitted model on the test split.
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

y_test_int = y_test.astype('int')
# Score with the positive-class probabilities. The curve below is drawn
# from these scores, so the area in the legend must come from the same
# scores; using hard 0/1 predictions would describe a different curve.
positive_proba = logreg.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test_int, positive_proba)
fpr, tpr, thresholds = roc_curve(y_test_int, positive_proba)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # chance-level diagonal for reference
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
<Figure size 432x288 with 0 Axes>
![ROC curve of the logistic regression model](output_23_1.png)