# Classification algorithm showdown
# ----------------------- 1. Load the data set -----------------------------
import pandas as pd

# Read the CSV into a DataFrame and preview its first rows.
dataSet=pd.read_csv('../MLinAction_source/Social_Network_Ads.csv')
print(dataSet.head())
# Explore the data: dimensions first, then the per-column summary.
print(dataSet.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
dataSet.info()
# ----------------------- 2. Encoding / preprocessing -----------------------------
# Only the single 'Gender' feature needs encoding, so LabelEncoder is used
# because it accepts one-dimensional input.
from sklearn.preprocessing import LabelEncoder
# Assign the whole column instead of writing through dataSet.loc[:, 'Gender']:
# recent pandas treats the .loc form as an in-place write that keeps the
# column's original dtype, whereas plain column assignment replaces the column
# with the encoder's integer output.
dataSet['Gender']=LabelEncoder().fit_transform(dataSet['Gender'])
print(dataSet.head(10))
# ----------------------- 3. Train / test split -----------------------------
from sklearn.model_selection import train_test_split

# Feature matrix: every column except the first (User ID) and the last one.
x=dataSet.iloc[:,1:-1].values
# Label vector: the last column.
y=dataSet.iloc[:,-1].values
# Hold out 25% of the rows for testing; random_state pins the shuffle.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=0)
# Preview the first ten rows of the training and test features.
for title,rows in (("训练集:\n",x_train),("测试集:\n",x_test)):
    print(title,rows[:10])
# Standardize the features: StandardScaler removes the mean of each feature
# column and scales it to unit variance (mean 0, variance 1).
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
# Learn the per-feature mean and variance from the training set only.
x_train=sc.fit_transform(x_train)
# Reuse the training statistics for the test set. Calling fit_transform here
# would re-fit the scaler on the test data, putting train and test on
# different scales and leaking test-set information into preprocessing.
x_test=sc.transform(x_test)
# Preview the standardized training and test sets.
print("标准化后的训练集:\n",x_train[:10])
print("标准化后的测试集:\n",x_test[:10])
# ----------------------- 4. Modelling with each algorithm -----------------------------
# ------------- 4.1 Logistic regression -----
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# Fit on the training split, then predict labels for the test split.
logreg=LogisticRegression().fit(x_train,y_train)
y_pred=logreg.predict(x_test)
# Report test accuracy as a percentage rounded to two decimals.
logreg_accuracy=accuracy_score(y_test,y_pred)*100
print("Logistic回归准确率:\n",round(logreg_accuracy,2))
# ------------- 4.2 K-nearest neighbours -----
from sklearn.neighbors import KNeighborsClassifier

# Classify each test row from its 3 nearest training neighbours.
knn=KNeighborsClassifier(n_neighbors=3).fit(x_train,y_train)
y_pred=knn.predict(x_test)
# Report test accuracy as a percentage rounded to two decimals.
knn_accuracy=accuracy_score(y_test,y_pred)*100
print("KNN准确率:\n",round(knn_accuracy,2))
# ------------- 4.3 Gaussian naive Bayes -----
from sklearn.naive_bayes import GaussianNB

# Fit on the training split, then predict labels for the test split.
gaussian=GaussianNB().fit(x_train,y_train)
y_pred=gaussian.predict(x_test)
# Report test accuracy as a percentage rounded to two decimals.
gaussian_accuracy=accuracy_score(y_test,y_pred)*100
print("高斯朴素贝叶斯准确率:\n",round(gaussian_accuracy,2))
# ------------- 4.4 Decision tree -----
from sklearn.tree import DecisionTreeClassifier

# Fit a tree with default settings, then predict labels for the test split.
decision_tree=DecisionTreeClassifier().fit(x_train,y_train)
y_pred=decision_tree.predict(x_test)
# Report test accuracy as a percentage rounded to two decimals.
decision_tree_accuracy=accuracy_score(y_test,y_pred)*100
print("决策树:\n",round(decision_tree_accuracy,2))
# ------------- 4.5 Random forest -----
from sklearn.ensemble import RandomForestClassifier

# Fit an ensemble of 100 trees, then predict labels for the test split.
random_forest=RandomForestClassifier(n_estimators=100).fit(x_train,y_train)
y_pred=random_forest.predict(x_test)
# Report test accuracy as a percentage rounded to two decimals.
random_forest_accuracy=accuracy_score(y_test,y_pred)*100
print("随机森林:\n",round(random_forest_accuracy,2))
# Dataset:
# Link: (the URL is not included in this file)
# Extraction code: p28b