前几个月在看关于矩阵分解的数学知识和斯坦福的机器学习公开课,转两个博客:
1.http://blog.csdn.net/harryhuang1990/article/details/9924377
2.http://blog.csdn.net/wangran51/article/details/7408406
自己用python写了一个矩阵分解的代码,主要是根据第一个博客的思路:
目标函数是要最小化C:
求其梯度:
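梯度下降求最小值(与下文代码中的更新一致,其中 $e_{ij} = r_{ij} - \sum_{m} u_{im} v_{mj}$,且只对 $r_{ij} \neq 0$ 的元素进行更新):
$$u_{if} \leftarrow u_{if} + \alpha\,(e_{ij}\,v_{fj} - \lambda\,u_{if}), \qquad v_{fj} \leftarrow v_{fj} + \alpha\,(e_{ij}\,u_{if} - \lambda\,v_{fj})$$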
测试数据:
随机梯度下降的实现代码:
import numpy as np
import matplotlib.pyplot as plt
import random
import math
class LFM:
    """Latent factor model: factorise a rating matrix R into U (x-by-factor)
    times V (factor-by-y) with stochastic gradient descent.

    Entries of R equal to 0 are treated as missing and are skipped both when
    training and when computing the error.

    Typical use: ``LFM(path, factor)`` -> ``loadData()`` -> ``initModel()``
    -> ``iterator()``; the factors are then in ``decompose_u`` and
    ``decompose_v`` and the per-epoch error changes in ``delta_error``.
    """

    # Class-level defaults, kept so that ``LFM.<name>`` lookups keep working.
    # The mutable ones (``datasets``, ``delta_error``) are re-created per
    # instance in ``__init__`` so that instances never share state.
    data_address = ''
    datasets = []
    np_training_datasets = np.zeros(1)
    decompose_u = np.zeros(1)
    decompose_v = np.zeros(1)
    factor = 0
    size_training_datasets_x = 0
    size_training_datasets_y = 0
    alpha = 0.1
    iter_num = 20
    Lambda = 0.1
    epsilon = 0.01
    delta_error = []

    def __init__(self, data_address, factor, iter_num=20, alpha=0.1, Lambda=0.1, epsilon=0.01):
        """Store the hyper-parameters; no file access happens here.

        Args:
            data_address: path of the whitespace-separated integer matrix file.
            factor: number of latent factors (columns of U / rows of V).
            iter_num: maximum number of passes over the observed entries.
            alpha: learning rate of the gradient step.
            Lambda: regularisation weight.
            epsilon: stop once the error changes by less than this between
                two consecutive passes.
        """
        self.data_address = data_address
        self.factor = factor
        self.alpha = alpha
        self.iter_num = iter_num
        self.Lambda = Lambda
        self.epsilon = epsilon
        # Fresh per-instance containers: appending to the class-level lists
        # would leak data from one model into every other one.
        self.datasets = []
        self.delta_error = []

    def loadData(self):
        """Read the matrix at ``data_address`` into ``datasets`` (list of
        rows) and ``np_training_datasets`` (2-D numpy array).

        Each non-blank line is one row of whitespace-separated integers.
        Calling the method again replaces the previously loaded data.

        Raises:
            ValueError: if a token is not an integer or the rows do not form
                a non-empty rectangular matrix.
        """
        rows = []
        with open(self.data_address, 'r') as input_file:
            for line in input_file:
                # split() already discards the newline; slicing off the last
                # character would eat a digit when the final line has none.
                fields = line.split()
                if fields:
                    rows.append([int(value) for value in fields])
        matrix = np.array(rows)
        if matrix.ndim != 2:
            raise ValueError(
                'training data in %r is not a non-empty rectangular matrix'
                % (self.data_address,))
        self.datasets = rows
        self.np_training_datasets = matrix

    def initModel(self):
        """Record the matrix dimensions and initialise U and V to all ones."""
        [x, y] = self.np_training_datasets.shape
        self.size_training_datasets_x = x
        self.size_training_datasets_y = y
        self.decompose_u = np.ones([x, self.factor])
        self.decompose_v = np.ones([self.factor, y])

    def fNormCalc(self, matrix):
        """Return the Frobenius norm of ``matrix`` as a float."""
        values = np.asarray(matrix, dtype=float)
        return math.sqrt(float(np.sum(np.square(values))))

    def errorCalc(self):
        """Return the regularised squared error over the observed entries.

        For every non-zero entry r_ij the squared residual
        ``(r_ij - (U V)_ij) ** 2`` is accumulated together with
        ``Lambda * (||U||_F^2 + ||V||_F^2)``.
        """
        ratings = self.np_training_datasets
        observed = ratings != 0
        predictions = np.dot(self.decompose_u, self.decompose_v)
        residuals = (ratings - predictions)[observed]
        # The norms do not depend on (i, j), so compute the penalty once.
        penalty = self.Lambda * (
            np.sum(np.square(self.decompose_u))
            + np.sum(np.square(self.decompose_v)))
        # NOTE(review): the penalty is counted once per observed entry, which
        # preserves the accounting of the original loop - confirm that this
        # is the intended objective rather than adding it a single time.
        count = np.count_nonzero(observed)
        return float(np.sum(np.square(residuals)) + count * penalty)

    def iterator(self):
        """Run SGD over the observed entries for at most ``iter_num`` passes.

        After each pass the absolute change of ``0.5 * errorCalc()`` is
        compared with ``epsilon``: if it is smaller, training stops;
        otherwise the change is appended to ``delta_error``.
        ``decompose_u`` and ``decompose_v`` are updated in place.
        """
        ratings = self.np_training_datasets
        u = self.decompose_u
        v = self.decompose_v
        # argwhere yields indices in row-major order, i.e. the same visiting
        # order as a nested "for i / for j" loop; the set of observed entries
        # never changes during training, so compute it once.
        observed = np.argwhere(ratings != 0)
        for _ in range(self.iter_num):
            old_error = 0.5 * self.errorCalc()
            for i, j in observed:
                for f in range(self.factor):
                    # The prediction is refreshed for every factor because
                    # the previous factor's update already changed U and V.
                    residual = ratings[i, j] - np.dot(u[i, :], v[:, j])
                    u[i, f] += self.alpha * (residual * v[f, j] - self.Lambda * u[i, f])
                    # Uses the value of u[i, f] that was just updated above.
                    v[f, j] += self.alpha * (residual * u[i, f] - self.Lambda * v[f, j])
            new_error = 0.5 * self.errorCalc()
            delta = abs(new_error - old_error)
            if delta < self.epsilon:
                break
            self.delta_error.append(delta)
if __name__ == "__main__":
    # Train a 3-factor model for up to 1000 passes on the sample matrix.
    model = LFM('c:\\trainingset.txt', 3, 1000)
    model.loadData()
    model.initModel()
    model.iterator()

    # Plot the recorded per-pass error change against the pass index.
    errors = model.delta_error
    steps = range(len(errors))
    plt.figure(1)
    plt.plot(steps, errors)
    plt.ylim(0, 50)
    plt.show()
结果显示:
图1.迭代误差
分解所得到的矩阵:
如果大家发现哪里有错,或者算法可以向哪些方向改进,或者有推荐看的书,请留言一下~~
原文地址:http://www.cnblogs.com/Key-Ky/p/3579363.html