读了很多文章,介绍协同训练时都是说对于某一数据集从不同视图进行协同训练;在学习西瓜书时,书中介绍协同训练并不局限于不同视图,还可以是不同学习器,甚至是同一学习器的不同参数设置,因此本文利用 sklearn 中的 SVM 和决策树(TREE)对同一数据集进行协同训练。
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import svm
from sklearn import tree


class co_train:
    """Co-train two different learners (SVM and decision tree) on one dataset.

    Classic co-training uses two views of the data; following the watermelon
    book's generalization it can also use two different classifiers on the
    same single view: each round both learners label their most confident
    unlabeled samples, the pool shrinks, and both models are refit on their
    grown labeled sets.
    """

    def __init__(self, data, label):
        """
        :param data: raw dataset (DataFrame); rows whose label cell is NaN
                     are treated as unlabeled
        :param label: name of the label column
        """
        self.loaddata = data
        self.label = label

    # Split the dataset into labeled and unlabeled parts.
    def data_load(self):
        """Return (labeled_rows, unlabeled_rows), split on NaN in the label column."""
        has_label = self.loaddata[self.label].notnull()
        label_data = self.loaddata[has_label]
        unlabel_data = self.loaddata[~has_label]
        return label_data, unlabel_data

    # Convert a non-numeric label column to ordinal integers.
    def label_change(self, label_data, label_name):
        """
        :param label_data: labeled data (DataFrame)
        :param label_name: name of the label column
        :return: copy of label_data with the label column numerically encoded
        """
        # BUG FIX: the original passed the 1-D value array to fit_transform
        # (OrdinalEncoder requires a 2-D input) and then used that value array
        # as a column selector in drop()/assignment, which raises. Operate on
        # the column *name* instead, on a copy to avoid mutating the caller's frame.
        label_data = label_data.copy()
        enc = preprocessing.OrdinalEncoder()
        label_data[label_name] = enc.fit_transform(label_data[[label_name]]).ravel()
        return label_data

    # Fit the two co-trained learners: SVM and decision tree.
    def model_train(self, X, y):
        """Fit both classifiers on (X, y) and return (svmclf, treeclf)."""
        svmclf = svm.SVC(probability=True)  # probability=True enables predict_proba
        treeclf = tree.DecisionTreeClassifier()
        svmclf.fit(X, y)
        treeclf.fit(X, y)
        return svmclf, treeclf

    # Extract the p best-scored class-0 samples and the n best-scored class-1 samples.
    def sample_extract(self, X_data, y_data_pro, y_data, p, n):
        """
        :param X_data: unlabeled samples that were just predicted
        :param y_data_pro: predicted class probabilities, shape (m, 2)
        :param y_data: predicted hard labels (0.0 / 1.0)
        :param p: number of class-0 samples to extract
        :param n: number of class-1 samples to extract
        :return: X_res0, y_res0, X_res1, y_res1, X_data
                 X_res0 / y_res0: extracted class-0 samples and labels
                 X_res1 / y_res1: extracted class-1 samples and labels
                 X_data: remaining samples, next round's unlabeled pool
        """
        proba = np.asarray(y_data_pro)
        X_data = pd.DataFrame(X_data).copy()
        # Attach per-class scores and the predicted label as helper columns.
        X_data['y_0'] = proba[:, 0]
        X_data['y_1'] = proba[:, 1]
        X_data['y'] = y_data
        X_data0 = X_data[X_data['y'] == 0.0]
        X_data1 = X_data[X_data['y'] == 1.0]
        # BUG FIX: the original called sort_values(..., inplace=False) and
        # discarded the result, so head(p) took arbitrary rows instead of the
        # most confident ones. Sort first, then take up to the requested count
        # (head() already caps at the available row count, so no length guard
        # is needed).
        X_data0 = X_data0.sort_values(by="y_0", ascending=False)
        X_res0 = X_data0.head(p)
        X_data0 = X_data0.iloc[len(X_res0):]
        X_data1 = X_data1.sort_values(by="y_1", ascending=False)
        X_res1 = X_data1.head(n)
        X_data1 = X_data1.iloc[len(X_res1):]
        # BUG FIX: DataFrame.append was removed in pandas 2.0; use pd.concat.
        # (The original `X_data0 is None` guard could never be true — drop()
        # always returns a DataFrame.)
        X_data = pd.concat([X_data0, X_data1])
        # Strip the helper columns so the results are plain feature/label data.
        y_res0 = X_res0['y'].values
        y_res1 = X_res1['y'].values
        helper_cols = ['y_0', 'y_1', 'y']
        X_res0 = X_res0.drop(helper_cols, axis=1)
        X_res1 = X_res1.drop(helper_cols, axis=1)
        X_data = X_data.drop(helper_cols, axis=1)
        return X_res0, y_res0, X_res1, y_res1, X_data

    # DataFrame / array -> plain Python list (rows as lists, scalars kept as-is).
    def df_list(self, data):
        """Convert a DataFrame, array, or sequence to a list of rows/scalars."""
        if len(data) == 0:
            return []
        rows = list(np.array(data))
        # BUG FIX: the original tested `type(x) == float or int`, which is
        # always truthy (`or int` evaluates the class object `int`), so the
        # list-conversion branch was unreachable. Keep scalars, listify rows.
        return [row if isinstance(row, (int, float, np.integer, np.floating))
                else list(row)
                for row in rows]

    def run(self, p, n):
        """Run co-training rounds until the unlabeled pool is exhausted.

        :param p: class-0 samples each learner labels per round
        :param n: class-1 samples each learner labels per round
        :return: the two final fitted classifiers (svmclf, treeclf)
        """
        label_data, unlabel_data = self.data_load()
        # BUG FIX: `label_data[self.label][0]` indexes by *label* 0, which
        # fails whenever the filtered frame's index does not contain 0;
        # use positional access instead.
        if isinstance(label_data[self.label].iloc[0], str):
            label_data = self.label_change(label_data, self.label)
        y = label_data[self.label].values
        X = np.array(label_data.drop(self.label, axis=1))
        # Each learner keeps its own growing labeled set, seeded identically.
        svm_label_data = X
        tree_label_data = X
        svm_y = y
        tree_y = y
        unlabel_data = unlabel_data.drop(self.label, axis=1)
        unlabel_data = self.df_list(unlabel_data)
        svmclf, treeclf = self.model_train(X, y)
        while len(unlabel_data) > 0:
            y_data_pro_svm = svmclf.predict_proba(unlabel_data)
            y_data_svm = svmclf.predict(unlabel_data)
            y_data_pro_tree = treeclf.predict_proba(unlabel_data)
            y_data_tree = treeclf.predict(unlabel_data)
            svm_Xres0, svm_yres0, svm_Xres1, svm_yres1, svmX_data = \
                self.sample_extract(unlabel_data, y_data_pro_svm, y_data_svm, p, n)
            tree_Xres0, tree_yres0, tree_Xres1, tree_yres1, treeX_data = \
                self.sample_extract(unlabel_data, y_data_pro_tree, y_data_tree, p, n)
            # Each learner absorbs its own confident picks.
            # NOTE(review): textbook co-training feeds each learner the OTHER
            # learner's picks; preserved as originally written — confirm intent.
            svm_label_data = (self.df_list(svm_label_data)
                              + self.df_list(svm_Xres0) + self.df_list(svm_Xres1))
            svm_y = (self.df_list(svm_y)
                     + self.df_list(svm_yres0) + self.df_list(svm_yres1))
            tree_label_data = (self.df_list(tree_label_data)
                               + self.df_list(tree_Xres0) + self.df_list(tree_Xres1))
            tree_y = (self.df_list(tree_y)
                      + self.df_list(tree_yres0) + self.df_list(tree_yres1))
            # Next-round pool: deduplicated union of what both learners left over.
            # NOTE(review): if the two learners pick disjoint samples this union
            # may not shrink, so the loop is not guaranteed to terminate — the
            # original had the same property.
            unlabel_data = self.df_list(svmX_data) + self.df_list(treeX_data)
            unlabel_data = self.df_list(pd.DataFrame(unlabel_data).drop_duplicates())
            # Refit both learners from scratch on their grown labeled sets.
            svmclf = svm.SVC(probability=True)
            treeclf = tree.DecisionTreeClassifier()
            svmclf.fit(svm_label_data, svm_y)
            treeclf.fit(tree_label_data, tree_y)
        return svmclf, treeclf