# Bank-marketing prediction with logistic regression.
# ----------------------------------------- 1. Load the dataset -----------------------------------------------
import pandas as pd

# The CSV uses ';' as its field separator.
bankSet = pd.read_csv(
    '../MLinAction_source/bank-full.csv',
    sep=';',
)

# Print the number of missing values found in each column.
missing_per_column = bankSet.isnull().sum()
print(missing_per_column)
# ----------------------------------------- 2. Preprocessing -----------------------------------------------
# preprocessing.OrdinalEncoder: meant for features; maps categorical values to numeric codes.
# preprocessing.LabelEncoder: meant for labels (accepts 1-D input); maps classes to numeric codes.
# preprocessing.KBinsDiscretizer: sorts a continuous variable into bins and encodes the bin index.
# NOTE(review): the discretizers and the encoder below are fitted on the whole
# dataset, i.e. before any train/test split - confirm this is acceptable.
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import OrdinalEncoder

# Discretize age/duration/day into three ordinal bins (k-means bin edges).
est1 = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='kmeans')
x1 = bankSet.loc[:, ['age', 'duration', 'day']]
print(bankSet['age'].value_counts())  # distribution before binning
bankSet.loc[:, ['age', 'duration', 'day']] = est1.fit_transform(x1)

# Discretize balance/campaign/pdays/previous into two ordinal bins.
est2 = KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='kmeans')
x2 = bankSet.loc[:, ['balance', 'campaign', 'pdays', 'previous']]
bankSet.loc[:, ['balance', 'campaign', 'pdays', 'previous']] = est2.fit_transform(x2)

# Inspect the result of the binning.
print(bankSet.head())
print(bankSet['age'].value_counts())
print(bankSet['balance'].value_counts())

# Ordinal-encode every column of the frame (string categories become codes).
# The frame is rebuilt from the encoder's float array rather than assigned
# through ``bankSet.iloc[:, :] = ...``: with pandas >= 2.0 that assignment
# writes into the existing columns, so string columns keep dtype ``object``
# and scikit-learn classifiers reject an object-dtype target as an unknown
# label type. Rebuilding guarantees float64 columns on every pandas version.
bankSet = pd.DataFrame(
    OrdinalEncoder().fit_transform(bankSet),
    columns=bankSet.columns,
    index=bankSet.index,
)
print(bankSet.head())
# ----------------------------------- Every column now holds numeric category codes -------------------------------------------
# ----------------------------------------- 3. Train/test split -----------------------------------------------
from sklearn.model_selection import train_test_split

# Features are every column except the last; the last column is the target.
# A quarter of the rows is held out for testing, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    bankSet.iloc[:, :-1],
    bankSet.iloc[:, -1],
    random_state=0,
    test_size=0.25,
)

# Show the shape of the training and test feature matrices.
for _subset in (x_train, x_test):
    print(_subset.shape)
# ----------------------------------------- 4. Logistic-regression classifier -----------------------------------------------
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Build and fit the model on the training split (fit() returns the estimator).
classifier = LogisticRegression(random_state=0).fit(x_train, y_train)

# Predict labels for the held-out rows.
y_pred = classifier.predict(x_test)

# Report the accuracy on the test split.
test_accuracy = accuracy_score(y_test, y_pred)
print('准确率:', test_accuracy)
# Dataset:
# link
# extraction code: 3ig5