数据挖掘学习笔记
现有一组facebook数据,x1-x10为已知属性,y为要预测的变量。首先导入所需的包和数据。
import numpy as np
import pandas as pd
from statsmodels.formula.api import ols
import statsmodels.api as sm
## Load the data
train_facebook = pd.read_csv(r"E:\wechart\train_facebook.csv")  # complete data (contains true y)
p_facebook = pd.read_csv(r"E:\wechart\train_facebook_p.csv")    # incomplete data (y to be predicted)
## Data split
train = p_facebook[0:10000]          # training set: rows 0..9999
predict = p_facebook[10001:]         # rows whose y we will predict
# NOTE(review): row 10000 falls between the two slices and is used by
# neither set — confirm whether skipping it is intentional.
true_value = train_facebook[10001:]  # ground-truth y for the prediction rows
predict = predict.iloc[:, :-1]       # drop the (missing) y column from the prediction set
观察数据发现x1,x3,x5,x7,x9的取值为字母H、C、D、S,而x2,x4,x6,x8,x10的取值为1-13的数字。
第一种思路,采取线性回归,来预测y的值
## Generate dummy variables for the letter-valued columns x1,x3,x5,x7,x9
## (values H/C/D/S; 'S' is the omitted baseline category, as in the original).
## A vectorized comparison replaces the original per-cell iloc loop over a
## hard-coded 8865 rows, so this also works for any number of rows.
for col in ('x1', 'x3', 'x5', 'x7', 'x9'):
    for letter in ('H', 'C', 'D'):
        # float dtype matches the original NaN-initialized columns filled with 0/1
        predict[col + '_' + letter] = (predict[col] == letter).astype(float)
## Generate the same dummy variables on the training set (columns are created
## in the same order as before: x1_H, x1_C, x1_D, x3_H, ... x9_D).
## Vectorized comparison replaces the original per-cell iloc loop over 10000 rows.
for col in ('x1', 'x3', 'x5', 'x7', 'x9'):
    for letter in ('H', 'C', 'D'):
        # float dtype matches the original NaN-initialized columns filled with 0/1
        train[col + '_' + letter] = (train[col] == letter).astype(float)
回归并预测
## Fit an OLS regression of y on the numeric columns plus the dummies,
## then round the continuous predictions to compare against the true labels.
formula = ('y ~ x2 + x4 + x6 + x8 + x10'
           ' + x1_H + x1_C + x1_D + x3_H + x3_C + x3_D'
           ' + x5_H + x5_C + x5_D + x7_H + x7_C + x7_D'
           ' + x9_H + x9_C + x9_D')
lm_s = ols(formula, data=train).fit()
print(lm_s.params)
lm_s.summary()
result = lm_s.predict(predict)
# Accuracy as the mean of a boolean Series — generalizes the original
# hard-coded division by 8865 to any prediction-set size.
true_rate_ols = (round(result) == true_value['y']).mean()
true_rate_ols
准确率只有2.5%,太低了,甚至低于10%(随机猜测也有10%的准确率),没有意义。
果然,用线性回归来预测这种离散的选择模型效果很差,不妨试试一些专门用于分类的算法。
KNN算法
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
## Convert the letters in the raw data to integers for KNN.
## The original used chained indexing (df.iloc[:, i][mask] = v), which
## operates on a temporary copy and can silently fail to write through
## (pandas SettingWithCopy); a single Series.replace() call is both safe
## and faster.
letter_codes = {'D': 0, 'C': 1, 'H': 2, 'S': 3}
knn_train_x = train.iloc[:, 1:11]
for i in range(0, 9, 2):  # even positions within x1..x10 are the letter columns
    knn_train_x.iloc[:, i] = knn_train_x.iloc[:, i].replace(letter_codes)
knn_train_y = train.iloc[:, 11]
knn_predict = predict.iloc[:, 1:11]
for i in range(0, 9, 2):
    knn_predict.iloc[:, i] = knn_predict.iloc[:, i].replace(letter_codes)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(knn_train_x, knn_train_y)
knn_result = knn.predict(knn_predict)
# Accuracy as a mean of booleans (replaces the hard-coded /8865).
true_rate_knn = (knn_result == true_value.iloc[:, -1]).mean()
true_rate_knn
准确率有52.6%,还不错,能否继续增加准确率呢
## Tune n_neighbors over 1..39 and plot accuracy against it.
result_list = []
n = 40
for i in range(1, n):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(knn_train_x, knn_train_y)
    knn_result = knn.predict(knn_predict)
    # Accuracy as a mean of booleans (replaces the hard-coded /8865).
    true_rate_knn = (knn_result == true_value.iloc[:, -1]).mean()
    result_list.append(true_rate_knn)
x = list(range(1, n))
matplotlib.rcParams['font.family'] = 'SimHei'  # so the Chinese axis label renders
plt.figure(figsize=(12, 9))
plt.plot(x, result_list)
plt.xlabel('n_neighbors', size=20)
plt.ylabel('准确率', size=20)
随着n的增加,准确率逐渐提升;n增加到10以后,提升不再明显,呈现波动状态,准确率为56%左右。
## Same sweep with distance-weighted votes (closer neighbors count more);
## n_jobs=-1 parallelizes prediction across all CPU cores.
result_list = []
n = 40
for i in range(1, n):
    knn = KNeighborsClassifier(n_neighbors=i, weights='distance', n_jobs=-1)
    knn.fit(knn_train_x, knn_train_y)
    knn_result = knn.predict(knn_predict)
    # Accuracy as a mean of booleans (replaces the hard-coded /8865).
    true_rate_knn = (knn_result == true_value.iloc[:, -1]).mean()
    result_list.append(true_rate_knn)
x = list(range(1, n))
matplotlib.rcParams['font.family'] = 'SimHei'  # so the Chinese axis label renders
plt.figure(figsize=(12, 9))
plt.plot(x, result_list)
plt.xlabel('n_neighbors', size=20)
plt.ylabel('准确率', size=20)
加权算法准确率提高了一些,有57%左右了
Logistic回归算法
线性回归效果很差,不妨试试专门用来处理选择模型的Logistic回归
from sklearn.linear_model import LogisticRegression
## Convert the letters to integers, same scheme as for KNN.
## Series.replace() avoids the original chained-indexing writes
## (df.iloc[:, i][mask] = v), which can silently modify a copy.
letter_codes = {'D': 0, 'C': 1, 'H': 2, 'S': 3}
log_train_x = train.iloc[:, 1:11]
for i in range(0, 9, 2):  # even positions within x1..x10 are the letter columns
    log_train_x.iloc[:, i] = log_train_x.iloc[:, i].replace(letter_codes)
log_train_y = train.iloc[:, 11]
log_predict = predict.iloc[:, 1:11]
for i in range(0, 9, 2):
    log_predict.iloc[:, i] = log_predict.iloc[:, i].replace(letter_codes)
lg = LogisticRegression()
lg.fit(log_train_x, log_train_y)
log_result = lg.predict(log_predict)
# Accuracy as a mean of booleans (replaces the hard-coded /8865).
true_rate_log = (log_result == true_value.iloc[:, -1]).mean()
true_rate_log
## accuracy is close to 50% — mediocre
决策树算法
## Single decision tree on the integer-encoded features
## (reuses the logistic regression's matrices).
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

tree_train_x = log_train_x
tree_train_y = log_train_y
tree_predict = log_predict
clf = DecisionTreeClassifier(criterion='gini')
clf.fit(tree_train_x, tree_train_y)
tree_pred = clf.predict(tree_predict)
true_rate_tree = (tree_pred == true_value.iloc[:, -1]).mean()
true_rate_tree
## accuracy is 48.6%
随机森林算法
## Random forest with 500 trees on the same encoded features.
from sklearn.ensemble import RandomForestClassifier

forest_train_x = log_train_x
forest_train_y = log_train_y
forest_predict = log_predict
forest = RandomForestClassifier(n_estimators=500)
forest.fit(forest_train_x, forest_train_y)
forest_pred = forest.predict(forest_predict)
true_rate_forest = (forest_pred == true_value.iloc[:, -1]).mean()
true_rate_forest
## accuracy 59% — an improvement
极度随机森林算法
## Extremely randomized trees ensemble.
from sklearn.ensemble import ExtraTreesClassifier

extraTree = ExtraTreesClassifier(n_estimators=100)
extraTree.fit(forest_train_x, forest_train_y)
extra_pred = extraTree.predict(forest_predict)
true_rate_extraForest = (extra_pred == true_value.iloc[:, -1]).mean()
true_rate_extraForest
## accuracy 56% — decent
Adaboost算法
## AdaBoost ensemble of 100 weak learners.
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(n_estimators=100)
ada.fit(forest_train_x, forest_train_y)
ada_pred = ada.predict(forest_predict)
true_rate_Ada = (ada_pred == true_value.iloc[:, -1]).mean()
true_rate_Ada
## accuracy 48.6%
Gbdt算法
## Gradient boosted decision trees (sklearn defaults).
from sklearn.ensemble import GradientBoostingClassifier

gbdt = GradientBoostingClassifier()
gbdt.fit(forest_train_x, forest_train_y)
gbdt_pred = gbdt.predict(forest_predict)
true_rate_Gbdt = (gbdt_pred == true_value.iloc[:, -1]).mean()
true_rate_Gbdt
## accuracy 60.5% — still improving
LGBM算法
# pip install lightgbm
from lightgbm import LGBMClassifier
## LightGBM needs numeric dtypes; the letter columns are still dtype object
## after the replace() step, so convert them explicitly. A loop over the
## column names replaces the ten duplicated pd.to_numeric lines.
for col in ('x1', 'x3', 'x5', 'x7', 'x9'):
    forest_train_x[col] = pd.to_numeric(forest_train_x[col])
    forest_predict[col] = pd.to_numeric(forest_predict[col])
lgbm = LGBMClassifier()
lgbm.fit(forest_train_x, forest_train_y)
true_rate_lgbm = (lgbm.predict(forest_predict) == true_value.iloc[:, -1]).mean()
true_rate_lgbm
## accuracy 61.5% — another improvement
xgboost算法
## XGBoost with default hyperparameters on the numeric feature matrix.
from xgboost import XGBClassifier

xgb = XGBClassifier()
xgb.fit(forest_train_x, forest_train_y)
xgb_pred = xgb.predict(forest_predict)
true_rate_xgb = (xgb_pred == true_value.iloc[:, -1]).mean()
true_rate_xgb
## accuracy 69% — a big improvement
高斯分布朴素贝叶斯算法
## Gaussian naive Bayes (the import also brings in the multinomial and
## Bernoulli variants used below).
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

gnb = GaussianNB()
gnb.fit(forest_train_x, forest_train_y)
gnb_pred = gnb.predict(forest_predict)
true_rate_gnb = (gnb_pred == true_value.iloc[:, -1]).mean()
true_rate_gnb
## accuracy 48.5%
多项式分布朴素贝叶斯算法
## Multinomial naive Bayes on the same features.
mnb = MultinomialNB()
mnb.fit(forest_train_x, forest_train_y)
mnb_pred = mnb.predict(forest_predict)
true_rate_mnb = (mnb_pred == true_value.iloc[:, -1]).mean()
true_rate_mnb
## accuracy 48.4%
伯努利分布朴素贝叶斯算法
## Bernoulli naive Bayes on the same features.
bnb = BernoulliNB()
bnb.fit(forest_train_x, forest_train_y)
bnb_pred = bnb.predict(forest_predict)
true_rate_bnb = (bnb_pred == true_value.iloc[:, -1]).mean()
true_rate_bnb
## accuracy 49.8%
试了这么多种算法,除了第一种实验性质的线性回归之外,其余算法的准确率基本都在50%左右;准确率最高的是xgboost算法,可以达到69%。