Data Mining Study Notes


We have a set of Facebook data in which x1-x10 are known attributes and y is the variable to be predicted. First, import the packages and load the data.

import numpy as np
import pandas as pd
from statsmodels.formula.api import ols
import statsmodels.api as sm
## Load the data
train_facebook = pd.read_csv(r"E:\wechart\train_facebook.csv")  ## complete data
p_facebook = pd.read_csv(r"E:\wechart\train_facebook_p.csv")    ## incomplete data
## Split the data
train = p_facebook[0:10000].copy()       ## training set
predict = p_facebook[10001:].copy()      ## rows whose y we want to predict
true_value = train_facebook[10001:]      ## the true y values for those rows
predict = predict.iloc[:, :-1]           ## drop the y column

Looking at the data, x1, x3, x5, x7 and x9 take the letter values H, C, D and S, while x2, x4, x6, x8 and x10 take integer values from 1 to 13.
First idea: use linear regression to predict y. Since the letter columns are categorical, they first have to be turned into dummy (indicator) variables.

## Generate dummy variables for the letter columns: for each of x1, x3, x5, x7, x9
## create x*_H, x*_C and x*_D (1 if the column equals that letter, otherwise 0;
## S serves as the omitted baseline category). This is done for both the
## prediction set and the training set.
for col in ['x1', 'x3', 'x5', 'x7', 'x9']:
    for letter in ['H', 'C', 'D']:
        predict[col + '_' + letter] = (predict[col] == letter).astype(int)
        train[col + '_' + letter] = (train[col] == letter).astype(int)

Run the regression and predict:

lm_s = ols('y ~ x2 + x4 + x6 + x8 + x10 + x1_H + x1_C + x1_D + x3_H + x3_C + x3_D + x5_H + x5_C + x5_D + x7_H + x7_C + x7_D + x9_H + x9_C + x9_D', data=train).fit()
print(lm_s.params)
lm_s.summary()
result = lm_s.predict(predict)
true_rate_ols = sum(round(result) == true_value['y']) / 8865
true_rate_ols

The accuracy is only 2.5%, which is far too low: it is even below the roughly 10% that random guessing would achieve, so the result is meaningless.
As expected, linear regression is a poor fit for this kind of discrete choice problem. Let's try some algorithms that are actually designed for classification.

KNN algorithm

from sklearn.neighbors import KNeighborsClassifier
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
## Convert the letters in the raw data to numbers
suit_map = {'D': 0, 'C': 1, 'H': 2, 'S': 3}
knn_train_x = train.iloc[:, 1:11].copy()
knn_train_y = train.iloc[:, 11]
knn_predict = predict.iloc[:, 1:11].copy()
for col in ['x1', 'x3', 'x5', 'x7', 'x9']:
    knn_train_x[col] = knn_train_x[col].replace(suit_map)
    knn_predict[col] = knn_predict[col].replace(suit_map)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(knn_train_x, knn_train_y)
knn_result = knn.predict(knn_predict)
true_rate_knn = sum(knn_result == true_value.iloc[:, -1]) / 8865
true_rate_knn

The accuracy is 52.6%, which is not bad. Can it be pushed higher?

## Tune n_neighbors to improve the accuracy
result_list = []
n = 40
for i in range(1, n):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(knn_train_x, knn_train_y)
    knn_result = knn.predict(knn_predict)
    true_rate_knn = sum(knn_result == true_value.iloc[:, -1]) / 8865
    result_list.append(true_rate_knn)
x = [i for i in range(1, n)]
matplotlib.rcParams['font.family'] = 'SimHei'  ## so Chinese characters display in the plot
plt.figure(figsize=(12, 9))
plt.plot(x, result_list)
plt.xlabel('n_neighbors', size=20)
plt.ylabel('准确率', size=20)

[Figure: accuracy vs. n_neighbors]

As n increases, the accuracy gradually improves; once n goes past about 10 the gains level off and the curve just fluctuates around 56%. Next, try distance-weighted voting.

result_list = []
n = 40
for i in range(1, n):
    knn = KNeighborsClassifier(n_neighbors=i, weights='distance', n_jobs=-1)
    knn.fit(knn_train_x, knn_train_y)
    knn_result = knn.predict(knn_predict)
    true_rate_knn = sum(knn_result == true_value.iloc[:, -1]) / 8865
    result_list.append(true_rate_knn)
x = [i for i in range(1, n)]
matplotlib.rcParams['font.family'] = 'SimHei'  ## so Chinese characters display in the plot
plt.figure(figsize=(12, 9))
plt.plot(x, result_list)
plt.xlabel('n_neighbors', size=20)
plt.ylabel('准确率', size=20)

[Figure: accuracy vs. n_neighbors with distance weighting]

Distance weighting raises the accuracy a little, to around 57%.
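
As an aside (not part of the original notes), the same sweep over n_neighbors can also be run as cross-validation on the training set alone, so that the held-out rows are not used to pick the parameter. A minimal sketch with scikit-learn's GridSearchCV:

from sklearn.model_selection import GridSearchCV

## Sketch: pick n_neighbors (and the weighting scheme) by 5-fold cross-validation
## on the training data only, instead of scoring every candidate on the held-out set.
param_grid = {'n_neighbors': list(range(1, 40)),
              'weights': ['uniform', 'distance']}
search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, n_jobs=-1)
search.fit(knn_train_x, knn_train_y)
print(search.best_params_, search.best_score_)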

Logistic regression algorithm

Plain linear regression performed badly, so let's try logistic regression, which is designed for discrete choice problems.

from sklearn.linear_model import LogisticRegression
## Convert the letters in the raw data to numbers, exactly as for KNN
log_train_x = train.iloc[:, 1:11].copy()
log_train_y = train.iloc[:, 11]
log_predict = predict.iloc[:, 1:11].copy()
for col in ['x1', 'x3', 'x5', 'x7', 'x9']:
    log_train_x[col] = log_train_x[col].replace(suit_map)
    log_predict[col] = log_predict[col].replace(suit_map)
lg = LogisticRegression()
lg.fit(log_train_x, log_train_y)
log_result = lg.predict(log_predict)
true_rate_log = sum(log_result == true_value.iloc[:, -1]) / 8865
true_rate_log
## The accuracy is close to 50%, mediocre
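
One caveat (my own addition, results not verified): mapping the suits to 0-3 imposes an artificial ordering on a purely categorical attribute, and a linear model such as logistic regression is sensitive to that. A rough sketch that instead reuses the x*_H / x*_C / x*_D dummy columns built earlier for the OLS step:

## Sketch: logistic regression on the numeric columns plus the one-hot suit
## dummies created earlier, instead of the ordinal 0-3 encoding.
dummy_cols = [f'x{k}_{s}' for k in (1, 3, 5, 7, 9) for s in ('H', 'C', 'D')]
numeric_cols = ['x2', 'x4', 'x6', 'x8', 'x10']
lg_dummy = LogisticRegression(max_iter=1000)
lg_dummy.fit(train[numeric_cols + dummy_cols], train['y'])
(lg_dummy.predict(predict[numeric_cols + dummy_cols]) == true_value['y']).mean()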

Decision tree algorithm

from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
tree_train_x = log_train_x
tree_train_y = log_train_y
tree_predict = log_predict
clf = DecisionTreeClassifier(criterion='gini')
clf.fit(tree_train_x,tree_train_y)
true_rate_tree=(clf.predict(tree_predict)==true_value.iloc[:,-1]).mean()
true_rate_tree
## Accuracy: 48.6%

Random forest algorithm

from sklearn.ensemble import RandomForestClassifier
forest_train_x = log_train_x
forest_train_y = log_train_y
forest_predict = log_predict
forest = RandomForestClassifier(n_estimators=500)
forest.fit(forest_train_x,forest_train_y)
true_rate_forest = (forest.predict(forest_predict)==true_value.iloc[:,-1]).mean()
true_rate_forest
## Accuracy: 59%, a further improvement
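
A fitted random forest also exposes impurity-based feature importances, which give a rough idea of which of x1-x10 drive the prediction. A small sketch (my own addition, not in the original notes):

## Sketch: rank the ten features by the forest's impurity-based importance.
importances = pd.Series(forest.feature_importances_, index=forest_train_x.columns)
print(importances.sort_values(ascending=False))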

Extremely randomized trees (ExtraTrees) algorithm

from sklearn.ensemble import ExtraTreesClassifier
extraTree = ExtraTreesClassifier(n_estimators= 100)
extraTree.fit(forest_train_x,forest_train_y)
true_rate_extraForest = (extraTree.predict(forest_predict)==true_value.iloc[:,-1]).mean()
true_rate_extraForest
## Accuracy: 56%, decent

AdaBoost algorithm

from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(n_estimators=100)
ada.fit(forest_train_x,forest_train_y)
true_rate_Ada = (ada.predict(forest_predict)==true_value.iloc[:,-1]).mean()
true_rate_Ada
## Accuracy: 48.6%

GBDT algorithm

from sklearn.ensemble import GradientBoostingClassifier
gbdt = GradientBoostingClassifier()
gbdt.fit(forest_train_x,forest_train_y)
true_rate_Gbdt = (gbdt.predict(forest_predict)==true_value.iloc[:,-1]).mean()
true_rate_Gbdt
## Accuracy: 60.5%, improving further

LightGBM algorithm

# pip install lightgbm
from lightgbm import LGBMClassifier
## Convert the object-dtype letter columns in the DataFrame to integers
for col in ['x1', 'x3', 'x5', 'x7', 'x9']:
    forest_train_x[col] = pd.to_numeric(forest_train_x[col])
    forest_predict[col] = pd.to_numeric(forest_predict[col])
lgbm = LGBMClassifier()
lgbm.fit(forest_train_x, forest_train_y)
true_rate_lgbm = (lgbm.predict(forest_predict) == true_value.iloc[:, -1]).mean()
true_rate_lgbm
## Accuracy: 61.5%, another improvement
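
LightGBM can also consume the suit columns as pandas categoricals directly, without any manual letter-to-number mapping. A minimal sketch (my own addition, assuming the raw train/predict frames still hold the letters; accuracy not verified):

## Sketch: let LightGBM treat the suit columns as categorical features itself.
cat_train_x = train.iloc[:, 1:11].copy()
cat_predict = predict.iloc[:, 1:11].copy()
for col in ['x1', 'x3', 'x5', 'x7', 'x9']:
    cat_train_x[col] = cat_train_x[col].astype('category')
    cat_predict[col] = cat_predict[col].astype('category')
lgbm_cat = LGBMClassifier()
lgbm_cat.fit(cat_train_x, train.iloc[:, 11])
(lgbm_cat.predict(cat_predict) == true_value.iloc[:, -1]).mean()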

XGBoost algorithm

from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(forest_train_x,forest_train_y)
true_rate_xgb = (xgb.predict(forest_predict)==true_value.iloc[:,-1]).mean()
true_rate_xgb
## Accuracy: 69%, a big jump

Gaussian naive Bayes algorithm


from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
gnb = GaussianNB()
gnb.fit(forest_train_x,forest_train_y)
true_rate_gnb= (gnb.predict(forest_predict)==true_value.iloc[:,-1]).mean()
true_rate_gnb
## Accuracy: 48.5%

Multinomial naive Bayes algorithm

mnb = MultinomialNB()
mnb.fit(forest_train_x,forest_train_y)
true_rate_mnb= (mnb.predict(forest_predict)==true_value.iloc[:,-1]).mean()
true_rate_mnb
## Accuracy: 48.4%

Bernoulli naive Bayes algorithm

bnb = BernoulliNB()
bnb.fit(forest_train_x,forest_train_y)
true_rate_bnb= (bnb.predict(forest_predict)==true_value.iloc[:,-1]).mean()
true_rate_bnb
## Accuracy: 49.8%

Having tried this many algorithms: apart from the first, purely experimental linear regression, all of them reach roughly 50% accuracy or better. The best performer is XGBoost, at about 69%.
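
To make the comparison easier to scan, the accuracy variables computed above can be collected into a single sorted overview (a small wrap-up sketch using the variables defined earlier):

## Collect all the accuracies computed above into one sorted table.
summary = pd.Series({
    'OLS (linear regression)': true_rate_ols,
    'KNN': true_rate_knn,
    'Logistic regression': true_rate_log,
    'Decision tree': true_rate_tree,
    'Random forest': true_rate_forest,
    'Extra trees': true_rate_extraForest,
    'AdaBoost': true_rate_Ada,
    'GBDT': true_rate_Gbdt,
    'LightGBM': true_rate_lgbm,
    'XGBoost': true_rate_xgb,
    'Gaussian NB': true_rate_gnb,
    'Multinomial NB': true_rate_mnb,
    'Bernoulli NB': true_rate_bnb,
}).sort_values(ascending=False)
print(summary)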
