复现:A Graph-Based Approach for Active Learning in Regression

 

该方法不依赖任何预测(回归)模型。

缺点:计算复杂度太高,在较大的数据集上运行非常耗时。

'''A Graph-Based Approach for Active Learning in Regression'''
"""
This AL method does not depend on a regression model.
"""
from collections import OrderedDict
from copy import deepcopy
from pathlib import Path
from time import time

import numpy as np
import pandas as pd
import xlrd
import xlwt
from scipy.spatial.distance import cdist
from sklearn.metrics.pairwise import pairwise_distances, pairwise_distances_argmin, pairwise_distances_argmin_min
from sklearn.preprocessing import StandardScaler
from skactiveml.pool import CostEmbeddingAL
from skactiveml.utils import MISSING_LABEL


class GALR():
    """Graph-based active learning for regression (model-free).

    Greedily queries the unlabeled sample whose labeling most reduces the
    total L1 distance from every remaining unlabeled sample to its nearest
    labeled sample. No regression model is trained during selection.

    Parameters
    ----------
    X, y : training features / targets (y only used for class bookkeeping).
    labeled : initial labeled index list (copied, not mutated).
    budget : number of queries to spend.
    X_test, y_test : held-out data, stored but unused by selection.
    """
    def __init__(self, X, y, labeled, budget, X_test, y_test):
        self.X = X
        self.y = y
        self.nSample, self.nDim = X.shape
        self.labels = sorted(np.unique(self.y))
        self.nClass = len(self.labels)
        self.X_test = X_test
        self.y_test = y_test
        self.labeled = list(deepcopy(labeled))
        # n_theta / theta kept for interface compatibility; unused here.
        self.n_theta = [i for i in range(self.nClass - 1)]
        self.theta = None
        self.unlabeled = self.initialization()
        self.budget = deepcopy(budget)
        self.budgetLeft = deepcopy(budget)
        # Precompute the full pairwise L1 distance matrix ONCE.  select()
        # only ever needs entries of this matrix, so the original
        # per-candidate pairwise_distances_argmin_min calls (recomputing
        # all distances every time) are avoided entirely.
        self.distMatrix = cdist(X, X, metric="cityblock")

    def initialization(self):
        """Return all sample indices not in the initial labeled set."""
        labeled_set = set(self.labeled)
        return [i for i in range(self.nSample) if i not in labeled_set]

    def select(self):
        """Spend the whole budget, one greedy query per round."""
        while self.budgetLeft > 0:
            # Nearest-labeled L1 distance for each unlabeled sample.
            min_dist = self.distMatrix[np.ix_(self.unlabeled, self.labeled)].min(axis=1)
            # BUG FIX: the original recomputed this inside the candidate
            # loop even though it is constant within a round.
            before = min_dist.sum()
            Q = OrderedDict()
            for udx in self.unlabeled:
                # If udx were labeled, each unlabeled sample's nearest-labeled
                # distance becomes min(current, dist-to-udx).  udx's own entry
                # is min(min_dist, 0) == 0, so it correctly drops out of the sum.
                after = np.minimum(min_dist, self.distMatrix[self.unlabeled, udx]).sum()
                Q[udx] = before - after
            # Largest reduction wins; ties go to the earliest unlabeled index.
            tar_idx = max(Q, key=Q.get)
            self.budgetLeft -= 1
            self.unlabeled.remove(tar_idx)
            self.labeled.append(tar_idx)




if __name__ == '__main__':
    # Driver: for each data set, run GALR on every stored train/test
    # partition and record the selection order in a new workbook.
    names_list = ["toy"]

    for name in names_list:
        print("########################{}".format(name))
        data_path = Path(r"D:\OCdata")
        partition_path = Path(r"E:\FFFFF\DataPartitions")

        # ---- read the whole data set ----
        read_data_path = data_path.joinpath(name + ".csv")
        data = np.array(pd.read_csv(read_data_path, header=None))
        X = np.asarray(data[:, :-1], np.float64)
        scaler = StandardScaler()
        X = scaler.fit_transform(X)  # zero-mean / unit-variance features
        y = data[:, -1]
        y -= y.min()  # shift labels so the smallest class becomes 0
        nClass = len(np.unique(y))
        Budget = 10 * nClass  # query budget: 10 samples per class

        # ---- read the partitions ----
        read_partition_path = str(partition_path.joinpath(name + ".xls"))
        book_partition = xlrd.open_workbook(read_partition_path)

        # One output sheet per partition sheet; columns hold
        # [train idx, test idx, initial labeled idx, final labeled idx].
        workbook = xlwt.Workbook()
        for SN in book_partition.sheet_names():
            S_Time = time()
            table_partition = book_partition.sheet_by_name(SN)
            # Columns 0/1/2 of the partition sheet hold the train / test /
            # initially-labeled indices; non-float cells are padding.
            train_idx = [int(v) for v in table_partition.col_values(0) if isinstance(v, float)]
            test_idx = [int(v) for v in table_partition.col_values(1) if isinstance(v, float)]
            labeled = [int(v) for v in table_partition.col_values(2) if isinstance(v, float)]

            X_train = X[train_idx]
            y_train = y[train_idx].astype(np.int32)
            X_test = X[test_idx]
            y_test = y[test_idx]

            model = GALR(X=X_train, y=y_train, labeled=labeled, budget=Budget, X_test=X_test, y_test=y_test)
            model.select()

            sheet = workbook.add_sheet(SN)
            for col, indices in enumerate((train_idx, test_idx, labeled, model.labeled)):
                for row, idx in enumerate(indices):
                    sheet.write(row, col, int(idx))
            print("SN:", SN, " Time:", time() - S_Time)

        # BUG FIX: the original filled the workbook but never saved it,
        # silently discarding every result.
        workbook.save(str(data_path.joinpath("GALR_" + name + ".xls")))


注:以下为同一方法的重复实现(作者忘记此前已写过,又实现了一遍)。

 

"""
ALCS
"""
import os
import numpy as np
import pandas as pd
from copy import deepcopy
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import pairwise_distances
from collections import OrderedDict
from itertools import combinations, product
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

class GB(object):
    """Greedy graph-based sample selection (duplicate of the GALR idea).

    Starting from one random seed sample, repeatedly labels the candidate
    that minimizes the total L1 distance from each remaining unlabeled
    sample to its nearest labeled sample.

    Parameters
    ----------
    X, y : data features / targets (y is stored but unused by selection).
    budget : total number of samples to label (including the random seed).
    """
    def __init__(self, X, y, budget):
        self.X = X
        self.y = y
        self.nSample = len(y)
        self.budgetLeft = deepcopy(budget)
        # All pairwise L1 distances, computed once up front.
        self.distMatrix = cdist(X, X, metric="cityblock")
        self.labeled = []
        self.unlabeled = list(range(self.nSample))

    def select(self):
        """Spend the budget: one random seed, then greedy coverage picks."""
        # Seed the labeled set with one uniformly random sample.
        tar_idx = np.random.choice(self.unlabeled, size=1, replace=False)[0]
        self.labeled.append(tar_idx)
        self.budgetLeft -= 1
        self.unlabeled.remove(tar_idx)
        while self.budgetLeft > 0:
            print("剩余预算:", self.budgetLeft)
            # Nearest-labeled L1 distance for every unlabeled sample.
            # BUG FIX (performance): the original re-derived these minima
            # with a triple-nested Python loop per round; a vectorized
            # np.minimum over precomputed matrix rows yields the exact
            # same scores (the candidate's own entry is 0 and drops out).
            min_dist = self.distMatrix[np.ix_(self.unlabeled, self.labeled)].min(axis=1)
            scores = [
                np.minimum(min_dist, self.distMatrix[self.unlabeled, udx]).sum()
                for udx in self.unlabeled
            ]
            # argmin keeps the first minimum, matching the original
            # dict-insertion-order tie-breaking.
            tar_idx = self.unlabeled[int(np.argmin(scores))]
            self.labeled.append(tar_idx)
            self.unlabeled.remove(tar_idx)
            self.budgetLeft -= 1




if __name__ == '__main__':
    # Quick experiment driver: load one data set, run the greedy graph-based
    # selector (GB) with a budget of 20 samples per class, then report how
    # many distinct classes the selected samples cover as the budget grows.
    # --------------------------------------#
    data = np.array(pd.read_csv(r'D:\ExperimentalData\Aggregation\aggregation.csv', header=None))
    # Alternative data sets tried by the author (kept for reference):
    # data = np.array(pd.read_csv(r'D:\ExperimentalData\R15\R15.csv', header=None))
    # data = np.array(pd.read_csv(r'D:\ExperimentalData\Jain\Jain.csv', header=None))
    # data = np.array(pd.read_csv(r'D:\ExperimentalData\D31\D31.csv', header=None))
    # data = np.array(pd.read_csv(r'D:\ExperimentalData\Ecoli\ecoli.csv', header=None))
    # data = np.array(pd.read_csv(r'D:\ExperimentalData\Three blobs\ThreeBlobs.csv', header=None))
    # data = np.array(pd.read_csv(r'D:\OCdata\car.csv', header=None))
    # data = np.array(pd.read_csv(r'D:\OCdata\SWD.csv', header=None))
    # data = np.array(pd.read_csv(r'D:\OCdata\newthyroid.csv', header=None))
    # data = np.array(pd.read_csv(r'D:\OCdata\Knowledge.csv', header=None))
    # data = np.array(pd.read_csv(r'D:\OCdata\ESL.csv', header=None))
    # data = np.array(pd.read_csv(r'D:\OCdata\balance-scale.csv', header=None))
    # data = np.array(pd.read_csv(r'D:\OCdata\winequality-red.csv', header=None))  # poor results
    # data = np.array(pd.read_csv(r'D:\OCdata\winequality-white.csv', header=None))
    # data = np.array(pd.read_csv(r'D:\OCdata\cleveland.csv', header=None))  # unusable
    # data = np.array(pd.read_csv(r'D:\OCdata\automobile.csv', header=None))
    # data = np.array(pd.read_csv(r'D:\OCdata\thyroid2.csv', header=None))  # unusable
    # data = np.array(pd.read_csv(r'D:\OCdata\thyroid.csv', header=None))  # unusable
    # data = np.array(pd.read_csv(r'D:\OCdata\glass.csv', header=None))
    # data = np.array(pd.read_csv(r'D:\OCdata\toy.csv', header=None))
    # data = np.array(pd.read_csv(r'D:\OCdata\ESL.csv', header=None))  # unusable
    # data = np.array(pd.read_csv(r'D:\ExperimentalData\dermatology\dermatology.csv', header=None))

    # Last CSV column is the label; the rest are standardized features.
    X = data[:, :-1]
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    y = data[:, -1]
    nClass = len(np.unique(y))
    print(" nClass ==", nClass)
    print(np.unique(y, return_counts=True))
    budget = 20 * nClass  # 20 queries per class
    # budget -= 77


    model = GB(X=X, y=y, budget=budget)
    model.select()
    print(model.labeled)
    # Sanity check: no index selected twice.
    print(len(model.labeled), "==", len(set(model.labeled)))
    print("剩余预算:", model.budgetLeft)
    labeled = model.labeled
    # For each growing prefix of the selection order, print how many
    # distinct classes the first i selections have covered.
    tmp = []
    for i, idx in enumerate(labeled):
        tmp.append(idx)
        print(i + 1, " ", len(set(y[tmp])))

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

DeniuHe

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值