汽车评估(Car Evaluation)数据集包含6个属性和1个分类目标。这6个属性是购买价格、维修价格、车门数量、座位数量、载货能力和安全性;分类目标是对汽车的评估,共有4个取值:不可接受(unacc)、一般(acc)、好(good)、很好(vgood)。数据集共有1728个样本,由 Marko Bohanec 创建,并由 Marko Bohanec 和 Blaz Zupan 于1997年捐赠给 UCI 机器学习库。该数据集被广泛用于分类算法的研究和实践。
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics
def naive_bayes(X_train, y_train, X_test):
    """Predict labels for ``X_test`` with a categorical naive Bayes model.

    Every feature is treated as a discrete category. Class priors are the
    relative label frequencies in ``y_train``. Per-feature likelihoods use
    add-one (Laplace) smoothing over the distinct values seen for that
    feature in ``X_train``. Scores are accumulated in log space.

    Args:
        X_train: 2-D array-like (samples x features) of hashable category
            values.
        y_train: 1-D array-like of class labels, one per row of ``X_train``.
            Labels must be mutually orderable because they are sorted once.
        X_test: 2-D array-like of rows to classify, with the same feature
            order as ``X_train``.

    Returns:
        A list with one predicted label per row of ``X_test``. When several
        labels reach the same score, the smallest label wins.
    """
    # Accept lists / DataFrames as well as ndarrays. Non-array input is
    # wrapped with dtype=object so mixed str/int categories keep their type.
    X_train, y_train, X_test = (
        data if isinstance(data, np.ndarray) else np.array(data, dtype=object)
        for data in (X_train, y_train, X_test)
    )

    n_samples = len(y_train)
    n_features = X_train.shape[1]

    # Sorted unique labels give a fixed iteration order, which makes
    # tie-breaking reproducible; the counts are reused for priors and for
    # the smoothing denominators.
    labels, label_counts = np.unique(y_train, return_counts=True)

    # Sets rather than np.unique: a feature column may mix str and int
    # values, which cannot be sorted.
    feature_values = [set(X_train[:, i]) for i in range(n_features)]

    # Log-likelihood used for a test value never seen in training. It is the
    # same for every label, so such a feature does not influence the argmax.
    unseen_log_likelihood = math.log(1e-10)

    # Train: log prior per label and smoothed log likelihood per
    # (label, feature index, value).
    log_priors = {}
    log_likelihoods = {}
    for label, label_count in zip(labels, label_counts):
        log_priors[label] = math.log(label_count / n_samples)
        in_class = y_train == label
        for i, values in enumerate(feature_values):
            column = X_train[in_class, i]
            denominator = label_count + len(values)
            for value in values:
                matches = np.count_nonzero(column == value)
                log_likelihoods[(label, i, value)] = math.log(
                    (matches + 1) / denominator
                )

    # Predict: pick the label with the highest log posterior for each row.
    predictions = []
    for row in X_test:
        scores = {
            label: log_priors[label]
            + sum(
                log_likelihoods.get((label, i, row[i]), unseen_log_likelihood)
                for i in range(n_features)
            )
            for label in labels
        }
        predictions.append(max(scores, key=scores.get))
    return predictions
# Load the Car Evaluation dataset. header=None: the file is read without a
# header row, so columns are addressed by position below.
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data', header=None)
# Convert string values to numerical codes. Each key is listed exactly once;
# values that are not keys of this mapping are left unchanged by replace().
df = df.replace({
    'vhigh': 4, 'high': 3, 'med': 2, 'low': 1,
    '5more': 5, 'more': 6,
    'small': 1, 'big': 3,
    'unacc': 0, 'acc': 1, 'good': 2, 'vgood': 3,
})
# Extract features (all columns but the last) and target (last column).
X = np.array(df.iloc[:, :-1])
# All four class strings are mapped to integers above, so cast explicitly:
# the label dtype then does not depend on whether replace() downcasts the
# column, and an unmapped label fails here instead of later.
y = np.array(df.iloc[:, -1], dtype=int)
# Split the data into train and test sets (80/20, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train on the training split and predict the test split.
predictions = naive_bayes(X_train, y_train, X_test)
# Evaluate the accuracy of the model
accuracy = metrics.accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")
读取数据集:`df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data', header=None)`