# Why is this so hard... so hard...
#encoding:utf-8
import numpy as np
from numpy import *
from numpy import linalg as la
def loadExData():
    """Return the small example rating matrix as a list of rows.

    The result has 7 rows and 5 columns.  A fresh list is built on every
    call, so callers may modify it freely.
    """
    ratings = [
        [0, 0, 0, 2, 2],
        [0, 0, 0, 3, 3],
        [0, 0, 0, 1, 1],
        [1, 1, 1, 0, 0],
        [2, 2, 2, 0, 0],
        [1, 1, 1, 0, 0],
        [5, 5, 5, 0, 0],
    ]
    return ratings
def loadExData2():
return[[0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5],
[0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3],
[0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0],
[3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0],
[5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0],
[0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0],
[4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1],
[0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4],
[0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2],
[0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0],
[1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0]]
'''相似度计算'''
'''1.欧式距离计算法'''
def euclidSim(inA, inB):
    """Return a Euclidean-distance similarity in the range (0, 1].

    The norm of ``inA - inB`` is mapped through ``1 / (1 + distance)``, so
    identical inputs score 1.0 and the score falls towards 0 as the inputs
    move apart.
    """
    distance = la.norm(inA - inB)
    return 1.0 / (1.0 + distance)
'''2.皮尔逊相关系数计算法'''
def pearSim(inA, inB):
    """Return a Pearson-correlation similarity rescaled to [0, 1].

    The correlation coefficient lies in [-1, 1]; it is mapped through
    ``0.5 + 0.5 * r``.  Inputs with fewer than three entries are given
    the similarity 1.0 without computing a correlation.
    """
    if len(inA) >= 3:
        # Entry [0][1] of the 2x2 correlation matrix is corr(inA, inB).
        correlation = corrcoef(inA, inB, rowvar=0)[0][1]
        return 0.5 + 0.5 * correlation
    return 1.0
'''3.余弦相似度计算法'''
def cosSim(inA, inB):
    """Return the cosine similarity of two vectors, rescaled to [0, 1].

    The cosine lies in [-1, 1]; it is mapped through ``0.5 + 0.5 * cos``.

    Both inputs are flattened first, so column vectors taken from an
    ``np.matrix`` (as the estimators in this file pass) and plain 1-D
    arrays are both accepted.  The inputs must hold the same number of
    elements.

    There is no guard against a zero-norm input: the ratio is then
    undefined and the division is left to NumPy.
    """
    vecA = np.asarray(inA).ravel()
    vecB = np.asarray(inB).ravel()
    # An explicit dot product avoids relying on ``*`` meaning matrix
    # multiplication and on converting a 1x1 matrix to a scalar.
    num = float(np.dot(vecA, vecB))
    denom = la.norm(vecA) * la.norm(vecB)
    return 0.5 + 0.5 * (num / denom)
'''基于物品相似度的推荐引擎'''
def standEst(dataMat, user, simMeas, item):
    """Estimate ``user``'s rating of ``item`` from item-to-item similarity.

    For every other item the user has rated (non-zero entry in the user's
    row), the similarity between that item and ``item`` is computed with
    ``simMeas`` over the rows in which both items have a rating > 0.  The
    estimate is the similarity-weighted average of the user's ratings.

    Args:
        dataMat: rating matrix indexed as ``dataMat[row, column]``.
        user: row index of the user.
        simMeas: callable taking the two co-rated column slices and
            returning a similarity.
        item: column index of the item to estimate.

    Returns:
        The weighted average rating, or 0.0 when the similarities sum
        to zero.
    """
    numItems = shape(dataMat)[1]
    weightSum = 0.0        # running sum of similarities
    weightedRatings = 0.0  # running sum of similarity * rating
    for other in range(numItems):
        rating = dataMat[user, other]
        if rating == 0:
            continue
        # Rows that have a rating for both `other` and `item`.
        ratedBoth = logical_and(dataMat[:, other] > 0, dataMat[:, item] > 0)
        coRaters = nonzero(ratedBoth)[0]
        if len(coRaters) == 0:
            sim = 0
        else:
            sim = simMeas(dataMat[coRaters, item], dataMat[coRaters, other])
        weightSum += sim
        weightedRatings += sim * rating
    if weightSum == 0:
        return 0.0
    # Dividing by the similarity total normalises the weighted sum back
    # onto the rating scale.
    return weightedRatings / weightSum
'---基于SVD评分估计---'
def svdEst(dataMat, user, simMeas, item, numSV=4):
    """Estimate ``user``'s rating of ``item`` in an SVD-reduced item space.

    The rating matrix is decomposed with SVD and every item (column) is
    projected to ``k`` dimensions via ``dataMat.T * U[:, :k] * inv(S_k)``,
    where ``S_k`` is the diagonal matrix of the ``k`` leading singular
    values.  Similarities between items are then measured on those
    ``k``-dimensional vectors, and the estimate is the similarity-weighted
    average of the ratings the user has already given.

    Args:
        dataMat: rating matrix (anything ``np.asmatrix`` accepts),
            indexed as ``[user, item]``.
        user: row index of the user.
        simMeas: callable taking two ``k x 1`` column vectors and
            returning a similarity.
        item: column index of the item to estimate.
        numSV: number of leading singular values to keep.  It is clamped
            to the number the decomposition yields, so small matrices
            no longer fail.

    Returns:
        The weighted average rating, or 0.0 when the similarities sum
        to zero.

    Raises:
        ValueError: if ``numSV`` is smaller than 1.

    Note:
        The kept singular values are inverted, so they must be non-zero.
        The decomposition is recomputed on every call.
    """
    if numSV < 1:
        raise ValueError('numSV must be at least 1')
    data = np.asmatrix(dataMat)
    numItems = data.shape[1]
    U, Sigma, _ = la.svd(data)
    # Never ask for more singular values than the decomposition produced.
    k = min(numSV, len(Sigma))
    sigK = np.asmatrix(np.diag(Sigma[:k]))
    # Row j of xformedItems is item j expressed in the k-dimensional space.
    xformedItems = data.T * np.asmatrix(U)[:, :k] * sigK.I
    simTotal = 0.0
    rateSimTotal = 0.0
    for j in range(numItems):
        userRating = data[user, j]
        if userRating == 0 or j == item:
            continue
        similarity = simMeas(xformedItems[j, :].T, xformedItems[item, :].T)
        simTotal += similarity
        rateSimTotal += similarity * userRating
    if simTotal == 0:
        return 0.0
    return rateSimTotal / simTotal
'''推荐系统 推荐用户没用过的商品 通过对商品得分的估计来推荐前N个'''
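# dataMat: rating matrix (rows indexed by user); user: row index of the user; N: number of recommendations; simMeas: similarity measure; estMethod: rating estimator (plain item similarity, or similarity after SVD reduction)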
def recommend(dataMat, user, N=3, simMeas=cosSim, estMethod=standEst):
    """Return the top-``N`` items the user has not rated, by estimated score.

    Args:
        dataMat: rating matrix exposing ``.A`` on its rows (an
            ``np.matrix``); a 0 entry in the user's row marks an item
            as unrated.
        user: row index of the user.
        N: maximum number of recommendations to return.
        simMeas: similarity measure handed to ``estMethod``.
        estMethod: callable ``(dataMat, user, simMeas, item)`` returning
            the estimated rating of ``item``.

    Returns:
        A list of ``(item index, estimated score)`` tuples sorted by score
        in descending order, or the string ``'you rated everything'``
        when the user's row contains no zero entry.
    """
    unratedItems = nonzero(dataMat[user, :].A == 0)[1]
    if len(unratedItems) == 0:
        return 'you rated everything'
    # Score every unrated item, then keep the N best.
    scored = [(item, estMethod(dataMat, user, simMeas, item))
              for item in unratedItems]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:N]
'''对图形进行压缩'''
#打印图形函数 32*32的图像
def printMat(inMat, thresh=0.8):
    """Print a matrix as rows of 0/1 flags.

    Each entry strictly greater than ``thresh`` is shown as ``1`` and every
    other entry as ``0``; the flags of one row are separated by single
    spaces and each row goes on its own line.

    Args:
        inMat: 2-D matrix or array of numbers; every row and column is
            printed, whatever the size.
        thresh: entries must exceed this value to be printed as 1.
    """
    # Plain ndarray rows iterate element by element, unlike np.matrix rows.
    pixels = np.atleast_2d(np.asarray(inMat))
    for row in pixels:
        # Building the whole line and printing it with a single-argument
        # call behaves the same under Python 2 and Python 3.
        print(' '.join('1' if float(value) > thresh else '0' for value in row))
#压缩图形函数
def imgCompress(numSV=3, thresh=0.8, fileName='0_5.txt'):
    """Print an image file before and after a truncated-SVD reconstruction.

    The file is read as text; the first 32 characters of every non-blank
    line are parsed as single digits and form one row of the image.  The
    image is printed with ``printMat``, rebuilt from its leading singular
    values, and printed again.

    Args:
        numSV: number of leading singular values used for the
            reconstruction (clamped to the number the SVD yields).
        thresh: threshold passed on to ``printMat``.
        fileName: path of the image text file.

    Returns:
        None.
    """
    width = 32  # digits read from each line
    rows = []
    # `with` closes the file even when a line fails to parse.
    with open(fileName) as imgFile:
        for line in imgFile:
            if not line.strip():
                continue  # tolerate blank lines such as a trailing newline
            rows.append([int(line[i]) for i in range(width)])
    mydat = np.asmatrix(rows)
    print("-------original matrix------")
    printMat(mydat, thresh)
    U, Sigma, VT = la.svd(mydat)
    k = min(numSV, len(Sigma))
    SigRecon = np.asmatrix(np.diag(Sigma[:k]))
    reconMat = np.asmatrix(U)[:, :k] * SigRecon * np.asmatrix(VT)[:k, :]
    print('-------reconstructed matrix using %d singular values----------' % k)
    # For a 32x32 image only U[:, :k], the k singular values and
    # VT[:k, :] are needed to rebuild it: 32*k + k + 32*k numbers instead
    # of 32*32 = 1024 (130 numbers for k = 2, almost a tenfold saving).
    # The rebuilt image differs from the original but stays close to it.
    printMat(reconMat, thresh)
if __name__ == '__main__':
    # Disabled demos; uncomment a group to run it.
    #
    # Similarity measures on two columns of the small example matrix:
    # dataMat = np.asmatrix(loadExData())
    # print(euclidSim(dataMat[:, 0], dataMat[:, 4]))
    # print(pearSim(dataMat[:, 0], dataMat[:, 4]))
    # print(cosSim(dataMat[:, 0], dataMat[:, 4]))
    #
    # Item-similarity recommendation engine:
    # dataMat[0, 1] = dataMat[0, 0] = dataMat[1, 0] = dataMat[2, 0] = 4
    # dataMat[3, 3] = 2
    # print(dataMat)
    # print(recommend(dataMat, 2))
    #
    # SVD-based rating estimation:
    # datMat = np.asmatrix(loadExData2())
    # print(datMat)
    # print(recommend(datMat, 1, estMethod=svdEst))

    # Image-compression demo; reads '0_5.txt' from the working directory.
    # A single-argument print call behaves the same on Python 2 and 3.
    print("----------压缩图像-----------")
    imgCompress(numSV=3, thresh=0.8)