前几个月在看关于矩阵分解的数学知识和斯坦福的机器学习公开课,转两个博客:
1.http://blog.csdn.net/harryhuang1990/article/details/9924377
2.http://blog.csdn.net/wangran51/article/details/7408406
自己用python写了一个矩阵分解的代码,主要是根据第一个博客的思路:
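目标函数是要最小化C(原文公式图片缺失,以下公式根据下文代码中的更新规则重建):
$$C=\frac{1}{2}\sum_{(i,j):\,r_{ij}\neq 0}\left[\left(r_{ij}-\sum_{f}u_{if}v_{fj}\right)^{2}+\lambda\left(\lVert u_{i}\rVert^{2}+\lVert v_{j}\rVert^{2}\right)\right]$$
求其梯度(对单个已观测评分 $r_{ij}$,记 $e_{ij}=r_{ij}-\sum_{f}u_{if}v_{fj}$):
$$\frac{\partial C}{\partial u_{if}}=-e_{ij}v_{fj}+\lambda u_{if},\qquad\frac{\partial C}{\partial v_{fj}}=-e_{ij}u_{if}+\lambda v_{fj}$$
梯度下降求最小值(其中 $\alpha$ 为学习率):
$$u_{if}\leftarrow u_{if}+\alpha\left(e_{ij}v_{fj}-\lambda u_{if}\right),\qquad v_{fj}\leftarrow v_{fj}+\alpha\left(e_{ij}u_{if}-\lambda v_{fj}\right)$$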
测试数据:
随机梯度下降的实现代码:
import math
import random

import numpy as np


class LFM:
    """Latent factor model: factorise a rating matrix R into U (x-by-factor)
    and V (factor-by-y) with stochastic gradient descent.

    Entries of R equal to 0 are treated as "not observed" and are skipped
    both when training and when computing the error.

    Typical usage::

        model = LFM(path, factor=3, iter_num=1000)
        model.loadData()
        model.initModel()
        model.iterator()
        # model.decompose_u, model.decompose_v, model.delta_error
    """

    # Class-level defaults are kept so that code reading ``LFM.alpha`` etc.
    # keeps working.  The two lists are re-created per instance in __init__
    # so that instances no longer share (and accumulate into) the same list.
    data_address = ''
    datasets = []
    np_training_datasets = np.zeros(1)
    decompose_u = np.zeros(1)
    decompose_v = np.zeros(1)
    factor = 0
    size_training_datasets_x = 0
    size_training_datasets_y = 0
    alpha = 0.1
    iter_num = 20
    Lambda = 0.1
    epsilon = 0.01
    delta_error = []

    def __init__(self, data_address, factor, iter_num=20, alpha=0.1,
                 Lambda=0.1, epsilon=0.01):
        """Store the hyper-parameters; no file is read here.

        Args:
            data_address: path of the whitespace-separated integer matrix.
            factor: number of latent factors (inner dimension of U and V).
            iter_num: maximum number of passes over the observed entries.
            alpha: learning rate of the gradient step.
            Lambda: regularisation weight.
            epsilon: stop once the error changes by less than this between
                two consecutive passes.
        """
        self.data_address = data_address
        self.factor = factor
        self.alpha = alpha
        self.iter_num = iter_num
        self.Lambda = Lambda
        self.epsilon = epsilon
        # Per-instance state (shadows the shared class-level lists).
        self.datasets = []
        self.delta_error = []

    def loadData(self):
        """Read the rating matrix from ``self.data_address``.

        Each non-blank line is one row of whitespace-separated integers.
        The parsed rows replace ``self.datasets`` and are also stored as a
        NumPy array in ``self.np_training_datasets``.

        Raises:
            OSError: if the file cannot be opened.
            ValueError: if a field is not an integer.
        """
        rows = []
        # ``with`` closes the file even when int() raises on a bad field.
        with open(self.data_address, 'r') as input_file:
            for line in input_file:
                # split() already discards the trailing newline; slicing off
                # the last character would truncate a final line that has
                # no newline.
                fields = line.split()
                if fields:  # skip blank lines instead of adding empty rows
                    rows.append([int(value) for value in fields])
        self.datasets = rows
        self.np_training_datasets = np.array(rows)

    def initModel(self):
        """Record the matrix size and initialise U and V to all ones.

        Raises:
            ValueError: if the loaded data is not a 2-D matrix.
        """
        if self.np_training_datasets.ndim != 2:
            raise ValueError(
                'training data must be a 2-D matrix, got shape %r'
                % (self.np_training_datasets.shape,))
        x, y = self.np_training_datasets.shape
        self.size_training_datasets_x = x
        self.size_training_datasets_y = y
        self.decompose_u = np.ones([x, self.factor])
        self.decompose_v = np.ones([self.factor, y])

    def fNormCalc(self, matrix):
        """Return the Frobenius norm of ``matrix`` as a Python float."""
        # Convert to float first so integer input cannot overflow on squaring.
        squares = np.square(np.asarray(matrix, dtype=float))
        return math.sqrt(float(np.sum(squares)))

    def _observed_cells(self):
        """Return (i, j) index pairs of the non-zero ratings, row-major."""
        return [(int(i), int(j))
                for i, j in np.argwhere(self.np_training_datasets != 0)]

    def _predict(self, i, j):
        """Return the current model prediction for cell (i, j)."""
        return float(np.dot(self.decompose_u[i, :], self.decompose_v[:, j]))

    def errorCalc(self):
        """Return the regularised squared error over the observed ratings.

        For every non-zero rating the squared residual plus
        ``Lambda * ||U||_F^2 + Lambda * ||V||_F^2`` is accumulated.

        NOTE(review): the regularisation term uses the norms of the whole
        U and V matrices and is added once per observed rating, whereas the
        update in ``iterator`` only penalises the row/column being updated.
        The term is kept as it was; confirm which objective is intended.
        """
        # Both norms are constant during the scan, so compute them once
        # rather than once per observed rating.
        reg_u = self.Lambda * pow(self.fNormCalc(self.decompose_u), 2)
        reg_v = self.Lambda * pow(self.fNormCalc(self.decompose_v), 2)
        error_sum = 0
        for i, j in self._observed_cells():
            # The prediction is the plain dot product of row i of U and
            # column j of V (the earlier `eui += eui + ...` doubled it).
            residual = self.np_training_datasets[i][j] - self._predict(i, j)
            error_sum += pow(residual, 2) + reg_u + reg_v
        return error_sum

    def iterator(self):
        """Train U and V in place with stochastic gradient descent.

        Runs at most ``iter_num`` passes over the observed ratings.  After
        each pass the absolute change of ``0.5 * errorCalc()`` is compared
        with ``epsilon``: if smaller, training stops; otherwise the change
        is appended to ``self.delta_error``.
        """
        observed = self._observed_cells()
        ratings = self.np_training_datasets
        u = self.decompose_u
        v = self.decompose_v
        # The error before a pass equals the error after the previous one,
        # so it is carried over instead of being recomputed every pass.
        old_error = 0.5 * self.errorCalc()
        for _ in range(self.iter_num):
            for i, j in observed:
                for f in range(self.factor):
                    # The residual is re-evaluated for every factor because
                    # the previous factor's update already changed U and V.
                    residual = ratings[i][j] - self._predict(i, j)
                    u[i][f] += self.alpha * (
                        residual * v[f][j] - self.Lambda * u[i][f])
                    # Uses the freshly updated u[i][f], as before.
                    v[f][j] += self.alpha * (
                        residual * u[i][f] - self.Lambda * v[f][j])
            new_error = 0.5 * self.errorCalc()
            delta = abs(new_error - old_error)
            if delta < self.epsilon:
                break
            self.delta_error.append(delta)
            old_error = new_error


def main():
    """Train on the sample file and plot the per-pass error change."""
    # Imported here so that the model can be imported and used on machines
    # without a plotting backend; only the demo script needs matplotlib.
    import matplotlib.pyplot as plt

    lfm1 = LFM('c:\\trainingset.txt', 3, 1000)
    lfm1.loadData()
    lfm1.initModel()
    lfm1.iterator()
    ex = range(len(lfm1.delta_error))
    plt.figure(1)
    plt.plot(ex, lfm1.delta_error)
    plt.ylim(0, 50)
    plt.show()


if __name__ == "__main__":
    main()
结果显示:
图1.迭代误差
分解所得到的矩阵:
如果大家发现哪里有错,或者算法可以向哪些方向改进,或者有推荐看的书,请留言一下~~