# OGA + HDIC + Trim
'''
Outline:
Implements the OGA + HDIC + Trim variable-selection procedure,
where HDIC is either HDBIC or HDHQ:
1. Run OGA (orthogonal greedy algorithm) for up to Kn iterations,
   where Kn is the OGA iteration cap.
2. After obtaining k regressors, compute the high-dimensional
   information criterion HDIC.
3. Choose the k (1 <= k <= Kn) that minimizes HDIC over the OGA path.
4. Use Trim to obtain the smallest relevant feature set.
Components:
1. Kn computation
2. OGA implementation
3. HDIC formula
4. Trim
5. choose minimizing k
6. get trimmed set
Train: train(); Test / pre / cv: TODO.
'''
from sklearn.model_selection import KFold
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.datasets import make_sparse_coded_signal
class OGA_HDIC_Trim:
    """Orthogonal Greedy Algorithm + High-Dimensional IC + Trim.

    Three-stage variable selection (Ing & Lai, 2011):
      1. run OGA for up to Kn iterations;
      2. record HDIC (HDBIC or HDHQ) after each iteration;
      3. pick the iteration k minimizing HDIC and trim the chosen model,
         keeping only features whose removal would increase HDIC.

    The last column of ``df`` is the response; all other columns are
    candidate regressors.
    """

    def __init__(self, df):
        self.df = df
        # Fixed: original assigned the whole DataFrame, not its columns.
        self.columns = self.df.columns
        self.row, self.col = df.shape
        self.sigma = np.inf                 # current model standard deviation
        self.newproject = pd.DataFrame()    # selected features, orthogonalized
        self.x_feat_np = self.df.iloc[:, :-1].to_numpy().astype(np.float32)
        # Current residual; starts as the response column.
        self.u = self.df.iloc[:, -1].values.astype(np.float32).reshape((-1, 1))
        self.coef = pd.DataFrame()          # one-row frame: coef per selected feature
        self.coef_Trim = pd.DataFrame()     # coefficients surviving Trim
        self.hdic = []                      # HDIC value after each OGA iteration

    def MaxIterKn(self, D=5):
        '''
        Iteration cap Kn = O((n / log p) ** 0.5); we use D * (n / log p) ** 0.5.
        Fixed: the original multiplied n by log(p) instead of dividing,
        contradicting both its own docstring and Ing & Lai (2011).
        :param D: constant factor in front of the rate.
        :return: integer iteration cap.
        '''
        return int(D * (self.row / np.log(self.col)) ** 0.5)

    def HDIC(self, IC='HDBIC', c=2):
        '''
        HDIC = n * log(sigma^2) + #J * wn * log(p)
        n  = number of samples, p = number of columns,
        sigma^2 = (1/n) * sum_t (y_t - yhat_t)^2 (mean squared residual),
        #J = number of features currently in the model.
        wn = log(n) for HDBIC, wn = c * log(log(n)) for HDHQ.
        :param IC: 'HDBIC' or 'HDHQ'.
        :param c: constant used by HDHQ.
        :return: the criterion value (None for an unknown IC name, as before).
        '''
        sigma = np.sum(self.u ** 2) / self.row   # this is sigma^2 in the formula
        fir = self.row * np.log(sigma)
        sec = self.newproject.shape[1] * np.log(self.col)
        if IC == 'HDBIC':
            return fir + sec * np.log(self.row)
        elif IC == 'HDHQ':
            return fir + sec * c * np.log(np.log(self.row))

    def _x_project(self, x_new):
        '''
        Orthogonalize ``x_new`` against the already-selected (orthogonal)
        columns: x_new - sum_i ((x_new . x_i) / ||x_i||^2) x_i.
        :param x_new: pandas Series, the raw feature column.
        :return: (n, 1) ndarray, the orthogonalized column.
        '''
        x_new = x_new.values.reshape(self.row, 1)
        newproject = self.newproject.values.reshape(self.row, -1)
        valprojectX = np.sum(x_new * newproject, axis=0)
        valprojextS = np.sum(newproject ** 2, axis=0)
        coef = valprojectX / valprojextS
        x_project = (np.sum(coef * newproject, axis=1)).reshape((self.row, -1))
        return x_new - x_project

    def singlefea(self, x_col):
        '''
        One-variable least squares of the current residual on every candidate
        column; return the name of the column with the smallest residual
        sum of squares. (Debug print removed.)
        :param x_col: DataFrame of candidate columns.
        :return: column label of the best single feature.
        '''
        col = x_col.columns
        newproject = x_col.values.reshape(self.row, -1)
        valprojectX = np.sum(self.u * newproject, axis=0)
        valprojextS = np.sum(newproject ** 2, axis=0)
        coef = valprojectX / valprojextS
        x_project = (np.sum(coef * newproject, axis=1)).reshape((self.row, -1))
        error = np.sum((self.u - x_project) ** 2, axis=0)
        minind = np.argmin(error)
        return col[minind]

    def OGA(self):
        '''
        One OGA step:
        1. pick the candidate x_(k+1) best fitting the current residual;
        2. orthogonalize it against the previously selected columns;
        3. regress the residual on it, update coef / residual / HDIC.
        :return: None (updates internal state).
        '''
        # Fixed: candidate pool must exclude the response (last) column,
        # otherwise OGA can "select" y itself.
        X = self.df.iloc[:, :-1].drop(self.newproject.columns, axis=1)
        col = self.singlefea(X)  # label of the current best feature
        # Orthogonalize the new feature (first one is taken as-is).
        if self.newproject.shape[1] == 0:
            self.newproject[col] = self.df[col]
        else:
            self.newproject[col] = self._x_project(self.df[col])
        # Coefficient of the residual on the orthogonalized column.
        new_col = self.newproject[col].values.astype(np.float32).reshape(self.row, 1)
        cur_coef = np.sum(self.u * new_col) / np.sum(new_col ** 2)
        # Fixed: assigning a scalar into an empty DataFrame created a 0-row
        # column; store it in an explicit single row instead.
        self.coef.at[0, col] = cur_coef
        self.u -= cur_coef * new_col          # update the residual
        self.hdic.append(self.HDIC(IC='HDBIC', c=2))

    def Trim(self, k):
        '''
        Keep feature j of model J(k) iff HDIC(J(k) - {j}) > HDIC(J(k)),
        i.e. removing j would worsen the criterion.
        NOTE(review): self.u is the residual after ALL OGA iterations, not
        after step k, so the trimmed sigma below is an approximation —
        confirm against Ing & Lai (2011) if exact trimming is required.
        :param k: 0-indexed OGA step minimizing HDIC (model has k+1 features).
        '''
        for col in self.newproject.columns[:k + 1]:
            coef_j = self.coef.at[0, col]
            dropcol = self.newproject[col].values.astype(np.float32).reshape(self.row, 1)
            # Residual variance with feature `col` added back (removed from model).
            sigma = np.sum((self.u + coef_j * dropcol) ** 2) / self.row
            # HDBIC of the model J(k) minus one feature: k remaining features.
            # Fixed: original used the total number of selected columns.
            hdiclessOne = self.row * np.log(sigma) + k * np.log(self.row) * np.log(self.col)
            # Fixed off-by-one: hdic[k] (not hdic[k-1]) is HDIC of J(k).
            if hdiclessOne > self.hdic[k]:
                self.coef_Trim[col] = self.coef[col]

    def train(self):
        '''Run the full pipeline: OGA path, HDIC minimization, Trim.'''
        # Cap iterations at the number of available predictors so OGA never
        # runs out of candidate columns.
        Maxiternum = min(self.MaxIterKn(D=5), self.x_feat_np.shape[1])
        for _ in range(Maxiternum):
            self.OGA()
        # 0-indexed OGA step whose model minimizes HDIC.
        k = np.argmin(np.array(self.hdic))
        self.Trim(k)
if __name__ == '__main__':
    # Build a synthetic sparse regression problem: y = X w where w has
    # exactly n_nonzero_coefs non-zero entries.
    n_components, n_features = 5000, 100
    n_nonzero_coefs = 17
    y, X1, w = make_sparse_coded_signal(
        n_samples=1,
        n_components=n_components,
        n_features=n_features,
        n_nonzero_coefs=n_nonzero_coefs,
        random_state=0,
    )
    # Assemble the design matrix with the response as the last column,
    # as OGA_HDIC_Trim expects.  (Dead locals X/u from the original removed.)
    data = np.column_stack((X1, y.T))
    cols = ['a_' + str(i) for i in range(data.shape[1])]
    df = pd.DataFrame(data, columns=cols)
    print(df.shape)
    model = OGA_HDIC_Trim(df)
    print(model)
    print(model.MaxIterKn(D=2))
    print(model.u)
# OGA-20230515