【Python】机器学习模型在红酒质量等级评价中的案例应用
1.红酒质量数据
# Load the wine-quality dataset (features + integer quality label per row).
import pandas as pd
import numpy as np
import warnings
# NOTE(review): this silences ALL warnings globally — it also hides sklearn's
# DataConversionWarning about column-vector labels below; consider narrowing.
warnings.filterwarnings("ignore")
# NOTE(review): the file is named "white wine data.csv" but the title says red
# wine and the first rows match the UCI red-wine data — confirm the dataset.
df = pd.read_csv('white wine data.csv')
df
fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
1 | 7.8 | 0.880 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 |
2 | 7.8 | 0.760 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 |
3 | 11.2 | 0.280 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 |
4 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2192 | 6.4 | 0.450 | 0.07 | 1.1 | 0.030 | 10.0 | 131.0 | 0.9905 | 2.97 | 0.28 | 10.8 | 5 |
2193 | 6.4 | 0.475 | 0.06 | 1.0 | 0.030 | 9.0 | 131.0 | 0.9904 | 2.97 | 0.29 | 10.8 | 5 |
2194 | 6.3 | 0.270 | 0.38 | 0.9 | 0.051 | 7.0 | 140.0 | 0.9926 | 3.45 | 0.50 | 10.5 | 7 |
2195 | 6.9 | 0.410 | 0.33 | 10.1 | 0.043 | 28.0 | 152.0 | 0.9968 | 3.20 | 0.52 | 9.4 | 5 |
2196 | 7.0 | 0.290 | 0.37 | 4.9 | 0.034 | 26.0 | 127.0 | 0.9928 | 3.17 | 0.44 | 10.8 | 6 |
2197 rows × 12 columns
2. 划分数据并训练机器学习模型
# Split into features X (all columns but the last) and label y (last column),
# hold out 30% for testing, then standardise the features.
# y is kept 1-D: sklearn classifiers expect shape (n_samples,), and the old
# reshape([-1, 1]) only ran silently because warnings were suppressed.
X, y = df.iloc[:, 0:-1].values, df.iloc[:, -1].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardise using statistics learned from the TRAINING set only.
from sklearn.preprocessing import StandardScaler
ss = StandardScaler().fit(X_train)
X_train_std = ss.transform(X_train)
# BUG FIX: the test set must be transformed with the scaler fitted on the
# training data — re-fitting on X_test (the old fit_transform) leaked test-set
# statistics into preprocessing and made train/test features inconsistent.
X_test_std = ss.transform(X_test)
# The labels are discrete quality classes, so they are deliberately not scaled.
#--------------- Modeling
# SVM Classifier
def svm_classifier(train_x, train_y):
    """Train and return an RBF-kernel SVM with probability estimates enabled."""
    from sklearn.svm import SVC
    # SVC.fit returns the fitted estimator itself, so fit-and-return in one step.
    return SVC(kernel='rbf', probability=True).fit(train_x, train_y)
# KNN Classifier
def knn_classifier(train_x, train_y):
    """Train and return a k-nearest-neighbours classifier (sklearn defaults)."""
    from sklearn.neighbors import KNeighborsClassifier
    # fit() returns the estimator, so the fitted model can be returned directly.
    return KNeighborsClassifier().fit(train_x, train_y)
# Logistic Regression Classifier
def logistic_regression_classifier(train_x, train_y):
    """Train and return an L2-regularised logistic-regression classifier."""
    from sklearn.linear_model import LogisticRegression
    clf = LogisticRegression(penalty='l2')
    return clf.fit(train_x, train_y)
# Random Forest Classifier
def random_forest_classifier(train_x, train_y, n_estimators=8, random_state=None):
    """Train and return a random-forest classifier.

    Args:
        train_x: training feature matrix, shape (n_samples, n_features).
        train_y: training labels, shape (n_samples,).
        n_estimators: number of trees. Default 8 preserves the original
            behaviour; sklearn's own default (100) usually scores better.
        random_state: seed for reproducible forests. Default None matches the
            original, non-deterministic behaviour — pass an int to fix results.

    Returns:
        The fitted RandomForestClassifier.
    """
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(train_x, train_y)
    return model
# Fit every classifier on the standardised training data (same order as the
# per-model calls: SVM, KNN, logistic regression, random forest) ...
train_x = X_train_std
train_y = y_train
model_svc, model_knn, model_logistic, model_rf = (
    trainer(train_x, train_y)
    for trainer in (svm_classifier, knn_classifier,
                    logistic_regression_classifier, random_forest_classifier)
)
# ---------- ... then score the held-out, standardised test features.
y_svc, y_knn, y_logistic, y_rf = (
    fitted.predict(X_test_std)
    for fitted in (model_svc, model_knn, model_logistic, model_rf)
)
# Results analysis: print accuracy plus macro/micro-averaged precision, recall
# and F1 for each model, in the order SVM, KNN, logistic regression, random forest.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
_predictions = (y_svc, y_knn, y_logistic, y_rf)
print('分类准确率为:', *(accuracy_score(y_test, p) for p in _predictions))
# Each row: (label, metric function, averaging scheme for multi-class scoring).
for _label, _scorer, _avg in (
    ('宏平均准确率:', precision_score, 'macro'),
    ('微平均准确率:', precision_score, 'micro'),
    ('宏平均召回率为:', recall_score, 'macro'),
    ('微平均召回率为:', recall_score, 'micro'),
    ('宏平均f1值为:', f1_score, 'macro'),
    ('微平均f1值为:', f1_score, 'micro'),
):
    print(_label, *(_scorer(y_test, p, average=_avg) for p in _predictions))
# Error evaluation
分类准确率为: 0.5818181818181818 0.5515151515151515 0.5575757575757576 0.6075757575757575
宏平均准确率: 0.280583340709923 0.2986745934975547 0.3499023740988492 0.3984623113419726
微平均准确率: 0.5818181818181818 0.5515151515151515 0.5575757575757576 0.6075757575757575
宏平均召回率为: 0.27642774299410267 0.27744523345842165 0.2858453966079045 0.3401924573344921
微平均召回率为: 0.5818181818181818 0.5515151515151515 0.5575757575757576 0.6075757575757575
宏平均f1值为: 0.27488434754737406 0.27950149117164064 0.29807537284434943 0.3552209623496858
微平均f1值为: 0.5818181818181818 0.5515151515151515 0.5575757575757576 0.6075757575757575