数据来源:
选自UCI机器学习库中的「银行营销数据集(Bank Marketing Data Set)」
算法完成目标:
这些数据与葡萄牙银行机构的营销活动相关。这些营销活动以电话为基础,一般,银行的客服人员需要联系客户至少一次,以此确认客户是否将认购该银行的产品(定期存款)。我们的目的是预测客户认购定期存款业务的概率。
代码输出的示例:
ID表示客户唯一标识,pred表示预测客户订购定期存款业务的概率。
ID | Pred |
---|------|
25318 | 0.123456 |
25319 | 0.654321 |
25320 | 0.799212 |
代码中的主要思路:
首先对数据进行预处理:将数据中“'job','marital','education','default','housing','loan',
'contact','poutcome','month'”等字符型特征进行编码,再对特征进行归一化处理,
并将数据划分为训练集和测试集;然后运用支持向量机(SVM)的方法
建立模型,最后完成对数据的预测分析。
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: './model/encoder0.pkl'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-3-4e2fb6df1a4a>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 92\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0m__name__\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'__main__'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 93\u001b[0m \u001b[0;31m# train()\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 94\u001b[0;31m \u001b[0mloader\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mModel_Loader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 95\u001b[0m \u001b[0mpredict\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mloader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfetch_predict_value\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 96\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m<ipython-input-3-4e2fb6df1a4a>\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 54\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencoder_list\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m9\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 56\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencoder_list\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjoblib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'./model/encoder'\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m'.pkl'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 57\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjoblib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'./model/ss.pkl'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 58\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0malgo\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjoblib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'./model/svm.pkl'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/numpy_pickle.py\u001b[0m in \u001b[0;36mload\u001b[0;34m(filename, mmap_mode)\u001b[0m\n\u001b[1;32m 568\u001b[0m \u001b[0mobj\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_unpickle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfobj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 569\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 570\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 571\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0m_read_fileobject\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmmap_mode\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfobj\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 572\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_basestring\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: './model/encoder0.pkl'"
]
}
],
"source": [
"from sklearn.svm import SVC\n",
"import pandas as pd\n",
"from sklearn.preprocessing import StandardScaler,LabelEncoder\n",
"from sklearn.model_selection import train_test_split\n",
"import numpy as np\n",
"from sklearn.externals import joblib\n",
"\n",
"#设定随机种子 防止模型效果不稳定\n",
"np.random.seed(50)\n",
"def train():\n",
"    \"\"\"Train an RBF-kernel SVM on the bank-marketing training set and persist it.\n",
"\n",
"    Side effects: reads ./train_set.csv; writes the fitted per-column\n",
"    LabelEncoders, the StandardScaler and the SVC model as .pkl files\n",
"    under ./model/ for later use by Model_Loader.\n",
"    \"\"\"\n",
"    import os\n",
"    # Ensure the output directory exists before any joblib.dump call;\n",
"    # otherwise saving fails with FileNotFoundError (the same error the\n",
"    # loader hits when these artifacts are missing).\n",
"    os.makedirs('./model', exist_ok=True)\n",
"\n",
"    # 1. Load the data (first 25318 rows = the labelled training portion)\n",
"    path='.//train_set.csv'\n",
"    reader = pd.read_csv(path, sep=',',iterator=True,engine='python')\n",
"    df = reader.get_chunk(25318)\n",
"\n",
"    # 2. Split into features X (drop the leading ID column) and target Y (last column)\n",
"    X=df.iloc[:,1:-1]\n",
"    Y=df.iloc[:,-1]\n",
"\n",
"    # 3. Feature engineering: label-encode each categorical column into 0,1,2,...\n",
"    X_list=['job','marital','education','default','housing','loan','contact','poutcome','month']\n",
"    for index,e in enumerate(X_list):\n",
"        # Fresh encoder per column; each is persisted so prediction time\n",
"        # can apply exactly the same mapping.\n",
"        encoder=LabelEncoder()\n",
"        X[e] = encoder.fit_transform(X[e])\n",
"        joblib.dump(encoder, './model/encoder'+str(index)+'.pkl')\n",
"    # Standardize every feature column (zero mean, unit variance)\n",
"    ss=StandardScaler()\n",
"    X=ss.fit_transform(X)\n",
"\n",
"    # Hold out 20% of the rows as a validation split\n",
"    x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2)\n",
"\n",
"    # Build the model; gamma and C were found by manual tuning\n",
"    svm=SVC(kernel='rbf',gamma=0.16,C=1.68,probability=True)\n",
"    svm.fit(x_train,y_train)\n",
"\n",
"    # Report accuracy on both splits\n",
"    print('测试集分类效果:{}'.format(svm.score(x_test,y_test)))\n",
"    print('训练集分类效果:{}'.format(svm.score(x_train,y_train)))\n",
"\n",
"    print(\"测试集每个客户的概率{}\".format(svm.predict_proba(x_test)))\n",
"    # Persist the scaler and the trained model\n",
"    joblib.dump(ss, './model/ss.pkl')\n",
"    joblib.dump(svm, './model/svm.pkl')\n",
"\n",
"\n",
"# Loader that restores the persisted preprocessing objects and model,\n",
"# then scores the hold-out prediction file.\n",
"class Model_Loader(object):\n",
"    # Categorical columns encoded at training time; order matches the\n",
"    # encoder0.pkl .. encoder8.pkl files written by train().\n",
"    ENCODED_COLUMNS = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome', 'month']\n",
"\n",
"    def __init__(self):\n",
"        # 1. Restore the per-column LabelEncoders, the scaler and the SVM.\n",
"        # Requires train() to have been run first so ./model/*.pkl exist;\n",
"        # otherwise joblib.load raises FileNotFoundError.\n",
"        self.encoder_list=[]\n",
"        for i in range(9):\n",
"            self.encoder_list.append(joblib.load('./model/encoder'+str(i)+'.pkl'))\n",
"        self.ss = joblib.load('./model/ss.pkl')\n",
"        self.algo = joblib.load('./model/svm.pkl')\n",
"\n",
"    def fetch_predict_value(self):\n",
"        \"\"\"Return a DataFrame pairing each customer ID with its predicted subscription probability.\"\"\"\n",
"        # Load the unlabelled prediction data\n",
"        path1='.//test_set.csv'\n",
"        reader = pd.read_csv(path1, sep=',', iterator=True, engine='python')\n",
"        df = reader.get_chunk(10000)\n",
"        x_test = df.iloc[:, 1:]\n",
"        # Encode every categorical column with the encoder fitted at train\n",
"        # time. NOTE(review): LabelEncoder.transform raises on labels that\n",
"        # were unseen during training — confirm the test file cannot contain any.\n",
"        for idx, col in enumerate(self.ENCODED_COLUMNS):\n",
"            x_test[col] = self.encoder_list[idx].transform(x_test[col])\n",
"\n",
"        # Apply the training-time standardization\n",
"        x_test=self.ss.transform(x_test)\n",
"        # predict_proba rows are [P(class 0), P(class 1)]; keep only the\n",
"        # positive-class (subscription) probability.\n",
"        predict_proba=self.algo.predict_proba(x_test)\n",
"        _predict_proba=[row[1] for row in predict_proba]\n",
"        # Pair each customer ID (first column of df) with its probability\n",
"        x_predict=pd.concat([df.iloc[:,0],pd.DataFrame(_predict_proba)],axis=1)\n",
"        return x_predict\n",
"\n",
"\n",
"\n",
"\n",
"if __name__ == '__main__':\n",
"    # train()  # run once first so the ./model/*.pkl artifacts exist\n",
"    loader=Model_Loader()\n",
"    predict=loader.fetch_predict_value()\n",
"\n",
"    # predict is already a DataFrame — write it out directly instead of\n",
"    # wrapping it in a redundant pd.DataFrame copy.\n",
"    predict.to_csv('F:/test2.csv')\n",
"    print(predict)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}