class KernelADASYN(OverSampling):
    """Kernel-density-estimation based ADASYN oversampling.

    Minority samples are weighted by their majority-neighbor score (as in
    ADASYN); a weighted Gaussian kernel density estimate over the minority
    class is then sampled via Markov Chain Monte Carlo.

    Notes:
        * The method of sampling was not specified, Markov Chain Monte Carlo
          has been implemented.
        * Not prepared for improperly conditioned covariance matrix.
    """

    categories = [OverSampling.cat_density_estimation,
                  OverSampling.cat_extensive,
                  OverSampling.cat_borderline]

    def __init__(self, proportion=1.0, k=5, h=1.0, n_jobs=1):
        """Constructor of the sampling object

        Args:
            proportion (float): proportion of the difference of n_maj and
                n_min to sample e.g. 1.0 means that after sampling the
                number of minority samples will be equal to the number of
                majority samples
            k (int): number of neighbors in the nearest neighbors component
            h (float): kernel bandwidth
            n_jobs (int): number of parallel jobs
        """
        super().__init__()
        self.check_greater_or_equal(proportion, "proportion", 0)
        self.check_greater_or_equal(k, 'k', 1)
        self.check_greater(h, 'h', 0)
        self.check_n_jobs(n_jobs, 'n_jobs')

        self.proportion = proportion
        self.k = k
        self.h = h
        self.n_jobs = n_jobs

    @classmethod
    def parameter_combinations(cls):
        """Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        return cls.generate_parameter_combinations(
            {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0],
             'k': [5, 7, 9],
             'h': [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 10.0]})

    def sample(self, X, y):
        """Does the sample generation according to the class parameters.

        Args:
            X (np.ndarray): training set
            y (np.array): target labels

        Returns:
            (np.ndarray, np.array): the extended training set and target
                labels
        """
        _logger.info(self.__class__.__name__ + ":" +
                     "Running sampling via %s" % self.descriptor())

        self.class_label_statistics(X, y)

        num_to_sample = self.number_of_instances_to_sample(
            self.proportion,
            self.class_stats[self.majority_label],
            self.class_stats[self.minority_label])
        if num_to_sample == 0:
            _logger.warning(self.__class__.__name__ +
                            ":" + "Sampling is not needed")
            return X.copy(), y.copy()

        X_min = X[y == self.minority_label]

        # fitting the nearest neighbors model; n_neighbors is keyword-only
        # in recent scikit-learn releases
        nn = NearestNeighbors(n_neighbors=min([len(X_min), self.k + 1]),
                              n_jobs=self.n_jobs)
        nn.fit(X)
        distances, indices = nn.kneighbors(X_min)

        # computing majority score: the number of majority neighbors of
        # each minority sample (index 0 is the sample itself, skip it)
        r = np.array([np.sum(y[indices[i][1:]] == self.majority_label)
                      for i in range(len(X_min))])

        # at least 2 minority samples with majority neighbors are needed
        # to compute a meaningful covariance below
        if np.sum(r > 0) < 2:
            _logger.info(self.__class__.__name__ + ":" +
                         "majority score is 0 for all or all but one "
                         "minority samples")
            return X.copy(), y.copy()

        # normalize the scores into kernel weights
        r = r / np.sum(r)

        # kernel density function
        def p_x(x):
            """Returns minority density value at x

            Args:
                x (np.array): feature vector

            Returns:
                float: density value
            """
            # normalization constant of the weighted Gaussian KDE
            result = 1.0 / (len(X_min) * self.h)
            result = result * (1.0 / (np.sqrt(2 * np.pi) * self.h)**len(X[0]))
            # weighted sum of Gaussian kernels centered on minority samples
            return result * np.inner(
                r,
                np.exp(-0.5 * np.linalg.norm(x - X_min, axis=1)**2 / self.h))

        samples = []
        it = 0

        # parameters of the Monte Carlo sampling
        burn_in = 1000
        periods = 50

        # covariance is used to generate a random sample in the neighborhood
        covariance = np.cov(X_min[r > 0], rowvar=False)

        if len(covariance) > 1 and np.linalg.cond(covariance) > 10000:
            _logger.info(self.__class__.__name__ + ":" +
                         "reducing dimensions due to inproperly conditioned"
                         " covariance matrix")
            # cannot reduce dimensionality below 2 features
            if len(X[0]) <= 2:
                _logger.info(self.__class__.__name__ +
                             ":" + "matrix ill-conditioned")
                return X.copy(), y.copy()
            # project to half the dimensions, sample there, then map back
            n_components = int(np.rint(len(covariance) / 2))
            pca = PCA(n_components=n_components)
            X_trans = pca.fit_transform(X)
            ka = KernelADASYN(proportion=self.proportion, k=self.k,
                              h=self.h, n_jobs=self.n_jobs)
            X_samp, y_samp = ka.sample(X_trans, y)
            return pca.inverse_transform(X_samp), y_samp

        # starting Markov-Chain Monte Carlo for sampling
        x_old = X_min[np.random.choice(np.where(r > 0)[0])]
        p_old = p_x(x_old)

        # Cholesky decomposition of the covariance drives the proposal step
        L = np.linalg.cholesky(covariance)

        while len(samples) < num_to_sample:
            # propose a new point in the neighborhood of the current one
            x_new = x_old + np.dot(np.random.normal(size=len(x_old)), L)
            p_new = p_x(x_new)

            # Metropolis-Hastings acceptance step
            alpha = p_new / p_old
            u = np.random.random()
            if u <= alpha:
                x_old = x_new
                p_old = p_new
            it = it + 1
            # record the chain state after burn-in, thinned by `periods`
            if it % periods == 0 and it > burn_in:
                samples.append(x_old)

        return (np.vstack([X, np.vstack(samples)]),
                np.hstack([y,
                           np.repeat(self.minority_label, len(samples))]))

    def get_params(self):
        """Returns:
            dict: the parameters of the current sampling object
        """
        return {'proportion': self.proportion,
                'k': self.k,
                'h': self.h,
                'n_jobs': self.n_jobs}