Performance Comparison of 4 Classifiers on cifar_10_batches

1. Problem statement
Given an image dataset, compare the classification performance of KNN, Naive Bayes, linear regression, and logistic regression.
Main dataset: cifar_10_batches (60,000 images, 32×32, 10 classes, 5 training batches and 1 test batch).
Requirement: report the classification accuracy on each data batch.
2. Notes
This version refines the earlier KNN / Naive Bayes / linear regression / logistic regression comparison code (results are now printed with pandas, and the warnings have been eliminated); the overall framework is unchanged. The main addition is a routine for reading cifar_10_batches, adapted from chaixl_Hello_World with several issues fixed.
3. Implementation

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import pandas as pd
###
# Clustering-style accuracy: optimal label matching via the Hungarian algorithm
import numpy as np
from scipy.optimize import linear_sum_assignment
def cluster_acc(y_true, y_pred):
    y_true = np.array(y_true).astype(np.int64)
    assert y_pred.size == y_true.size
    D = max(y_pred.max(), y_true.max()) + 1
    # Count co-occurrences of each (predicted, true) label pair
    w = np.zeros((D, D), dtype=np.int64)
    for i in range(y_pred.size):
        w[y_pred[i], y_true[i]] += 1
    # Hungarian matching: choose the label mapping that maximizes agreement
    ind = linear_sum_assignment(w.max() - w)
    ind = np.transpose(np.asarray(ind))
    return sum([w[i, j] for i, j in ind]) * 1.0 / y_pred.size
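# Quick sanity check (hypothetical toy arrays, not part of the original script):
# a prediction that is a consistent relabeling of the truth should score 1.0
assert cluster_acc(np.array([0, 0, 1, 1]), np.array([1, 1, 0, 0])) == 1.0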
#
## KNN / Naive Bayes / linear and logistic regression classification
def knn_gnb_lr_lsr(X, labels, title_knn="XXX.KNN",\
                title_gnb="XXX.GNB", title_lr="XXX.LR",\
                title_lsr="XXX.LSR", n=3):  # n: decimal places for the returned accuracies
    # Split into training and test sets
    x_train, x_test, labels_train, labels_test =\
        train_test_split(X, labels, test_size=0.2, random_state=22)

    # Classify with KNN
    knn = KNeighborsClassifier()
    knn.fit(x_train, labels_train)
    label_sample = knn.predict(x_test)
    knn_acc=cluster_acc(labels_test, label_sample)
    print(title_knn,"=",knn_acc)

    # Classify with Gaussian Naive Bayes
    gnb = GaussianNB()  # Naive Bayes with the default configuration
    gnb.fit(x_train, labels_train)  # fit the model
    label_sample = gnb.predict(x_test)
    gnb_acc = cluster_acc(labels_test, label_sample)
    print(title_gnb,"=", gnb_acc)

    # Linear regression: round predictions to the nearest class index, and
    # clip to the valid label range so cluster_acc cannot index out of bounds
    lr = LinearRegression()
    lr.fit(x_train, labels_train)
    label_sample = lr.predict(x_test)
    label_sample = np.clip(np.round(label_sample), 0, labels.max()).astype(np.int64)
    lr_acc = cluster_acc(labels_test, label_sample)
    print(title_lr, "=", lr_acc)

    # Logistic regression requires standardized features,
    # so standardize first and then re-split
    scaler = StandardScaler()
    X_ = scaler.fit_transform(X)
    # Split the scaled data into training and test sets
    x_train, x_test, labels_train, labels_test = \
        train_test_split(X_, labels, test_size=0.2, random_state=22)
    # Multiclass logistic regression (multinomial by default in sklearn)
    log_reg = LogisticRegression(max_iter=3000)
    # fit the model
    log_reg.fit(x_train, labels_train)
    label_sample = log_reg.predict(x_test)
    lsr_acc = cluster_acc(labels_test, label_sample)
    print(title_lsr, "=", lsr_acc)

    return round(knn_acc,n),round(gnb_acc,n),round(lr_acc,n),round(lsr_acc,n)
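# Example call (hypothetical arrays, for illustration only):
#   knn, gnb, lr, lsr = knn_gnb_lr_lsr(X_batch, y_batch, "b1.KNN",
#                                      "b1.GNB", "b1.LR", "b1.LSR", n=3)
# where X_batch is an (n_samples, n_features) array and y_batch holds integer labels.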

# Note: `file` (the batches.meta path) is accepted but not used by this loader
def get_imgdata(file, sfile, re_size=16, n=5):
    import pickle
    import numpy as np
    from skimage.transform import resize

    def unpickle(file):
        with open(file, 'rb') as f:
            cifar_dict = pickle.load(f, encoding='latin1')
        return cifar_dict
    # Lists to collect the image data, labels, and file names; the concatenated
    # cifar_image / cifar_label are the CIFAR-10 data and labels to return
    tem_cifar_image = []
    tem_cifar_label = []
    tem_cifar_image_name = []
    for i in range(1, n+1):
        # path of the i-th data batch file
        cifar_file = sfile + str(i)
        cifar = unpickle(cifar_file)
        cifar_label = cifar['labels']
        cifar_image = cifar['data']
        cifar_image_name = cifar['filenames']
        # CIFAR stores each image as (channels, height, width); transpose to (height, width, channels)
        cifar_image = cifar_image.reshape(10000, 3, 32, 32).transpose(0, 2, 3, 1).astype("float")
        cifar_image = np.asarray([resize(x_img, [re_size, re_size]) for x_img in cifar_image])
        cifar_label = np.array(cifar_label)
        cifar_image_name = np.array(cifar_image_name)
        tem_cifar_image.append(cifar_image)
        tem_cifar_label.append(cifar_label)
        tem_cifar_image_name.append(cifar_image_name)
    cifar_image = np.concatenate(tem_cifar_image)
    cifar_label = np.concatenate(tem_cifar_label)
    cifar_image_name = np.concatenate(tem_cifar_image_name)
    return cifar_image, cifar_label, cifar_image_name
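# Shape note (derived from the code above): cifar_image is
# (n*10000, re_size, re_size, 3); cifar_label and cifar_image_name
# each have length n*10000.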

file = "D:\\作品\\python\\cifar-10-python\\cifar-10-batches-py\\batches.meta"
sfile = "D:\\作品\\python\\cifar-10-python\\cifar-10-batches-py\\data_batch_"
n = 5              # number of data batches to load
re_size = 8        # downsample each image to re_size x re_size
X, Y, Z = get_imgdata(file, sfile, re_size, n)
X = X.reshape(n*10000, -1)  # flatten each image into a feature vector
# Split back into the n original batches
X_ = []
Y_ = []
for i in range(n):
    X_.append(X[i*10000:(i+1)*10000])
    Y_.append(Y[i*10000:(i+1)*10000])
X_ = np.array(X_)
Y_ = np.array(Y_)
knn_acc = []
gnb_acc = []
lr_acc = []
lsr_acc = []
for i in range(n):
    t1,t2,t3,t4= \
        knn_gnb_lr_lsr(X_[i], Y_[i])
    knn_acc.append(t1)
    gnb_acc.append(t2)
    lr_acc.append(t3)
    lsr_acc.append(t4)
# Print the results as a table with pandas
title1 = []
for i in range(n):
    t = 'data_batch' + str(i+1)
    title1.append(t)
title2 = ["KNN", "Naive Bayes", "Linear regression", "Logistic regression"]
data = pd.DataFrame([knn_acc,gnb_acc,lr_acc,lsr_acc],index=title2,columns=title1)
print(data)
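The script above scores each training batch with an internal 80/20 split; the archive's held-out test_batch is never read. Below is a minimal sketch of scoring test_batch the same way, as a sixth "dataset" (the test_batch path is an assumption mirroring the paths above; not part of the original script):

# Optional sketch: evaluate the held-out test_batch with the same routine
import pickle
from skimage.transform import resize
tfile = "D:\\作品\\python\\cifar-10-python\\cifar-10-batches-py\\test_batch"
with open(tfile, 'rb') as f:
    test = pickle.load(f, encoding='latin1')
Xt = test['data'].reshape(10000, 3, 32, 32).transpose(0, 2, 3, 1).astype("float")
Xt = np.asarray([resize(img, [re_size, re_size]) for img in Xt]).reshape(10000, -1)
Yt = np.array(test['labels'])
knn_gnb_lr_lsr(Xt, Yt, "test.KNN", "test.GNB", "test.LR", "test.LSR")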

4. Output
Classification performance table (the pandas DataFrame printed above): one row per classifier (KNN, Naive Bayes, Linear regression, Logistic regression) and one column per batch (data_batch1 through data_batch5).

5. Dataset download
CIFAR-10 python version
