写了一个简单的多类分类程序,给定一个数据集,在其上做10-fold交叉检验,输出loss,以及分类的结果。
最关键的函数是def do_cross_validation(dict_feature_list, y_list, num_features, num_fold, clf)。各个参数的含义是
dict_feature_list是一个python的dict列表,列表中每一个元素代表一个样本,比如一个文档;dict中,k代表特征,v是特征的值。
y_list是样本的标签
num_features是数据集的维度大小
num_fold是几次交叉检验,10则代表10-fold交叉检验
clf是分类算法
做交叉检验时,关键代码是skf = StratifiedKFold(y, n_folds=num_fold, shuffle=False, random_state=None),这个方法将根据类别的分布情况,对数据集做stratified分隔,尽量使得每个fold里的类别分布与原始数据集相同。毕竟,机器学习train出来的model做分类时,所面对的数据和训练数据有同样的分布才好。通过一个for循环,for i, (train_index, test_index) in enumerate(skf):将十次交叉检验的数据展开。代码最后用了metrics计算loss,当然可以偷懒直接用classification_report,我这里根据micro、macro、weighted三种方式,计算了下precision,recall和f1-score。
函数def make_2d_matrix_to_dict_lst(matrix)完全是为了测试代码,作用是将一个dense的矩阵,变成dict的列表。
函数def dict_lst_to_coo_sparse_matrix(dict_lst, num_features):是将一个dict的列表,转成sparse matrix,这样可以很大幅度的节约内存,尤其是在做文本分类的时候。
具体用的时候,偷懒用了one-vs-rest的多分类策略,基础算法使用的逻辑回归clf = OneVsRestClassifier(LogisticRegression())
# -*- coding: utf-8 -*-
from sklearn.cross_validation import StratifiedKFold
from sklearn import metrics
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
#transfer a python dict list to scipy COO sparse matrix
#dict_lst: [{a:b},{a:b,c:d}], each dict is the feature set of an instance
#num_features: the total number of features in dataset
def dict_lst_to_coo_sparse_matrix(dict_lst, num_features):
    """Convert a list of {feature_index: value} dicts into a scipy COO sparse matrix.

    Each dict is one instance (row); keys are integer column indices,
    values are the feature values.  Returns a coo_matrix of shape
    (len(dict_lst), num_features).
    """
    from scipy.sparse import coo_matrix
    # Collect the coordinates and values of every non-zero entry.
    row_vec = []
    col_vec = []
    data_vec = []
    for d_index, feature_dict in enumerate(dict_lst):
        for k, v in feature_dict.items():
            row_vec.append(d_index)
            col_vec.append(k)
            data_vec.append(v)
    # coo_matrix accepts plain Python lists; the explicit shape keeps
    # trailing all-zero rows/columns (and the empty-input case) correct.
    return coo_matrix((data_vec, (row_vec, col_vec)),
                      shape=(len(dict_lst), num_features))
#transfer a dense 2d matrix to dict lst
def make_2d_matrix_to_dict_lst(matrix):
    """Convert a dense 2-D matrix into a list of sparse dicts.

    For each row, builds a {column_index: value} dict keeping only the
    non-zero entries.  Intended as a test helper: it produces the input
    format expected by dict_lst_to_coo_sparse_matrix.
    """
    return [{j: v for j, v in enumerate(row) if v != 0}
            for row in matrix]
#base experimental code
def do_cross_validation(dict_feature_list, y_list, num_features, num_fold, clf):
    """Run stratified k-fold cross validation and collect per-fold losses.

    Parameters
    ----------
    dict_feature_list : list of dict
        One {feature_index: value} dict per instance.
    y_list : sequence
        Class label of each instance, aligned with dict_feature_list.
    num_features : int
        Total number of features (columns) in the dataset.
    num_fold : int
        Number of folds (e.g. 10 for 10-fold cross validation).
    clf : estimator
        Any sklearn-style classifier with fit() / predict().

    Returns
    -------
    loss_lst : list of tuple
        One 9-tuple per fold: (macro_pr, macro_re, macro_f1,
        micro_pr, micro_re, micro_f1, weighted_pr, weighted_re, weighted_f1).
    id2result : dict
        instance id -> (true label, predicted label), accumulated over all folds.
    """
    X = dict_feature_list  # instance set
    y = np.array(y_list)   # label set
    ids = np.arange(len(X))  # instance id set
    id2result = {}
    loss_lst = []
    # Stratified split: each fold keeps roughly the original class distribution.
    skf = StratifiedKFold(y, n_folds=num_fold, shuffle=False, random_state=None)
    for train_index, test_index in skf:
        # Split dataset into train and test partitions for this fold.
        y_train = y[train_index]
        X_train = [X[t] for t in train_index]
        y_test = y[test_index]
        id_test = ids[test_index]
        X_test = [X[t] for t in test_index]
        # Sparse representation saves memory, especially for text features.
        sparse_X_train = dict_lst_to_coo_sparse_matrix(X_train, num_features)
        sparse_X_test = dict_lst_to_coo_sparse_matrix(X_test, num_features)
        # Train on the training split, predict on the held-out split.
        clf.fit(sparse_X_train, y_train)
        predicted_labels = clf.predict(sparse_X_test)
        # Store per-instance results for later comparison.
        for index in range(len(id_test)):
            id2result[id_test[index]] = (y_test[index], predicted_labels[index])
        # Compute precision/recall/f1 under each averaging scheme, in the
        # fixed order macro, micro, weighted (matches the documented tuple).
        fold_scores = []
        for avg in ('macro', 'micro', 'weighted'):
            fold_scores.append(metrics.precision_score(
                y_test, predicted_labels, pos_label=None, average=avg))
            fold_scores.append(metrics.recall_score(
                y_test, predicted_labels, pos_label=None, average=avg))
            fold_scores.append(metrics.f1_score(
                y_test, predicted_labels, pos_label=None, average=avg))
        loss_lst.append(tuple(fold_scores))
    return loss_lst, id2result
#load digital recognition dataset
digits = datasets.load_digits()
X = digits.data
y = digits.target
num_features = len(X[0])
#make dict lst features: convert the dense matrix to the sparse dict format
feature_lst = make_2d_matrix_to_dict_lst(X)
#one-vs-rest multiclass strategy on top of logistic regression
clf = OneVsRestClassifier(LogisticRegression())
loss_lst, id2result = do_cross_validation(feature_lst, y, num_features, 10, clf)
#print per-fold scores; parenthesized print works in both Python 2 and 3
for loss in loss_lst:
    print(['%.3f' % r for r in loss])