下面的代码是从知乎一个答主那搬运过来的
import numpy as np
from scipy import linalg
class CCA:
def __init__(self):
self.a = None
self.b = None
def train(self, X, Y):
Nx, cx = X.shape
Ny, cy = Y.shape
# 标准化 (N, C)
X = (X - np.mean(X, 0)) / np.std(X, 0)
Y = (Y - np.mean(Y, 0)) / np.std(Y, 0)
# 求三个S
data = np.concatenate([X, Y], axis = 1)
cov = np.cov(data, rowvar=False)
N, C = cov.shape
Sxx = cov[0:cx, 0:cx]
Syy = cov[cx:C, cx:C]
Sxy = cov[0:cx, cx:C]
Sxx_ = linalg.sqrtm(np.linalg.inv(Sxx))
Syy_ = linalg.sqrtm(np.linalg.inv(Syy))
M = Sxx_.T.dot(Sxy.dot(Syy_))
U, S, V = np.linalg.svd(M, full_matrices=False)
u = U[:, 0]
v = V[0, :]
self.a = Sxx_.dot(u)
self.b = Syy_.dot(v)
def predict(self, X, Y):
X_ = X.dot(self.a)
Y_ = Y.dot(self.b)
return X_, Y_
def cal_corrcoef(self, X, Y):
X_, Y_ = self.predict(X, Y)
return np.corrcoef(X_, Y_)[0,1]
# test
n = 500
# 2 latents vars:
l1 = np.random.normal(size=n)
l2 = np.random.normal(size=n)
latents = np.array([l1, l1, l2, l2]).T
X = latents + np.random.normal(size=4 * n).reshape((n, 4))
Y = latents + np.random.normal(size=4 * n).reshape((n, 4))
X_train = X[:n // 2]
Y_train = Y[:n // 2]
X_test = X[n // 2:]
Y_test = Y[n // 2:]
print (X_train.shape)
(250, 4)
# my cca
clf = CCA()
clf.train(X_train, Y_train)
print (clf.cal_corrcoef(X_train, Y_train))
print (clf.cal_corrcoef(X_test, Y_test))
# 0.7200173442101319
# 0.671638992091987
# compare with sklearn
from sklearn.cross_decomposition import CCA
cca_sklearn = CCA(n_components=1)
cca_sklearn.fit(X_train, Y_train)
X_c, Y_c = cca_sklearn.transform(X_train, Y_train)
X_c_, Y_c_ = cca_sklearn.transform(X_test, Y_test)
print (np.corrcoef(X_c[:,0], Y_c[:, 0])[0,1])
print (np.corrcoef(X_c_[:, 0], Y_c_[:, 0])[0,1])
# 0.7202109376634042
# 0.6719982185448227