《机器学习实战》第十四章：利用SVD简化数据全部代码（含注释）

white________

已于 2022-10-09 16:59:21 修改

阅读量721

点赞数 1

文章标签：机器学习 python 人工智能

于 2022-10-09 16:58:07 首次发布

本文链接：https://blog.csdn.net/white________/article/details/127229913

版权

def loadExData():
    """Return the small 7x5 example rating table as a fresh list of rows.

    The driver script wraps this in a NumPy matrix and treats rows as users
    and columns as items; a 0 entry is treated as "not rated" by the
    estimators in this module.
    """
    ratings = [
        [0, 0, 0, 2, 2],
        [0, 0, 0, 3, 3],
        [0, 0, 0, 1, 1],
        [1, 1, 1, 0, 0],
        [2, 2, 2, 0, 0],
        [5, 5, 5, 0, 0],
        [1, 1, 1, 0, 0],
    ]
    return ratings

def loadExData2():
    """Return the larger 11x11 example rating table as a fresh list of rows.

    Same conventions as the smaller example: the callers in this file treat
    rows as users, columns as items, and 0 as "not rated".
    """
    ratings = [
        [0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5],
        [0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3],
        [0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0],
        [3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0],
        [5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0],
        [0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0],
        [4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1],
        [0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4],
        [0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2],
        [0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0],
        [1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0],
    ]
    return ratings

# 14-1 相似度计算
from numpy import *
from numpy import linalg as la

# 三种相似度计算的计算方法
# inA,inB:列向量
# 相似度值在0到1之间变化，并且物品对越相似，它们的相似度值也就越大

# 第一种方法:欧氏距离:使用 相似度=1/(1+距离) 计算相似度
def ecludSim(inA, inB):
    """Return a distance-based similarity: 1 / (1 + ||inA - inB||).

    Identical inputs give 1.0; the value shrinks towards 0 as the norm of
    the difference grows.
    """
    distance = la.norm(inA - inB)
    return 1.0 / (1.0 + distance)

# 第二种方法:皮尔逊相关系数 0.5+0.5*corrcoef() 并且把其取值范围归一化到0到1之间
def pearsSim(inA, inB):
    """Return the Pearson correlation of inA and inB rescaled to 0..1.

    The correlation coefficient (range -1..1) is mapped with
    0.5 + 0.5 * r. Inputs with fewer than 3 entries short-circuit to 1.0.
    """
    if len(inA) >= 3:
        # rowvar=0: each column is a variable, each row an observation.
        correlation = corrcoef(inA, inB, rowvar=0)[0][1]
        return 0.5 + 0.5 * correlation
    return 1.0

# 第三种方法:余弦相似度(其计算的是两个向量夹角的余弦值。如果夹角为90度，则相似度为0;如果两个向量的方向相同，则相似度为1.0)
# 余弦相似度的取值范围也在-1与+1之间，因此我们也将它归一化到0到1之间。
# 范数的计算方法linalg.norm()
def cosSim(inA, inB):
    """Return the cosine similarity of two vectors rescaled to 0..1.

    The cosine of the angle between the vectors (range -1..1) is mapped
    with 0.5 + 0.5 * cos, so orthogonal vectors score 0.5 and vectors
    pointing the same way score 1.0.

    Parameters:
        inA, inB: vectors of equal length. Column matrices (np.matrix),
            2-D column/row ndarrays and 1-D ndarrays are all accepted;
            both are flattened before use.

    Note: if either vector has zero norm the denominator is zero and the
    division is not guarded.
    """
    # Flatten to 1-D so the inner product is a true scalar. The previous
    # float(inA.T * inB) only worked for np.matrix columns and relied on
    # converting a 1x1 array to a Python float, which NumPy deprecates.
    vecA = asarray(inA).ravel()
    vecB = asarray(inB).ravel()
    num = float(dot(vecA, vecB))
    denom = la.norm(vecA) * la.norm(vecB)
    return 0.5 + 0.5 * (num / denom)

# 14-2 基于物品相似度的推荐引擎
# 用来计算在给定相似度计算方法的条件下，用户对物品的估计评分值
def standEst(dataMat, user, simMeas, item):
    """Estimate one user's rating of an item from item-to-item similarity.

    Parameters:
        dataMat: rating matrix (np.matrix; the code uses the .A attribute).
            Rows are users, columns are items, 0 means "not rated".
        user: row index of the user.
        simMeas: callable(colA, colB) returning a similarity score.
        item: column index of the item whose rating is being estimated.

    Returns:
        The similarity-weighted average of the user's existing ratings,
        or 0 when the accumulated similarity is 0.

    Prints one line per compared item pair.
    """
    itemCount = shape(dataMat)[1]
    weightSum = 0.0
    weightedRatings = 0.0
    for other in range(itemCount):
        rating = dataMat[user, other]
        # Items this user has not rated contribute nothing.
        if rating == 0:
            continue
        # Row indices of users who rated BOTH `item` and `other`.
        ratedTarget = dataMat[:, item].A > 0
        ratedOther = dataMat[:, other].A > 0
        shared = nonzero(logical_and(ratedTarget, ratedOther))[0]
        if len(shared) > 0:
            # Compare the two items only over the users they share.
            sim = simMeas(dataMat[shared, item], dataMat[shared, other])
        else:
            # No common raters: treat the pair as having no similarity.
            sim = 0
        print("the %d and %d similarity is: %f" % (item, other, sim))
        weightSum += sim
        weightedRatings += sim * rating
    if weightSum != 0:
        # Normalising by the total similarity keeps the estimate on the
        # same scale as the ratings it is averaged from.
        return weightedRatings / weightSum
    return 0

# 推荐引擎 默认调用standEst函数，产生了最高的N个推荐结果
# 如果不指定N的大小，则默认值为3。该函数另外的参数还包括相似度计算方法和估计方法
def recommend(dataMat, user, N=3, simMeas=cosSim, estMethod=standEst):
    """Return the top-N estimated ratings for items the user has not rated.

    Parameters:
        dataMat: rating matrix (np.matrix; the code uses the .A attribute).
            Rows are users, columns are items, 0 means "not rated".
        user: row index of the user.
        N: maximum number of recommendations to return.
        simMeas: similarity function handed through to estMethod.
        estMethod: callable(dataMat, user, simMeas, item) returning the
            estimated rating for one item.

    Returns:
        A list of (item index, estimated score) tuples sorted by score,
        highest first, truncated to N entries. If the user has no unrated
        items, the string "you rated everything" is returned instead.
    """
    # Column indices where this user's rating is 0.
    unratedItems = nonzero(dataMat[user, :].A == 0)[1]
    if len(unratedItems) == 0:
        return "you rated everything"
    itemScores = [
        (item, estMethod(dataMat, user, simMeas, item))
        for item in unratedItems
    ]
    # Best estimates first; ties keep their original relative order.
    itemScores.sort(key=lambda pair: pair[1], reverse=True)
    return itemScores[:N]

# 14-3 基于SVD的评分估计
# 先用SVD把物品映射到低维空间，在低维空间中计算待估物品与该用户已评分物品之间的相似度，再按相似度加权得到估计评分
def svdEst(dataMat, user, simMeas, item):
    # 训练数据集 用户编号 相似度计算方法 未评分的物品编号
    n = shape(dataMat)[1]
    # 对数据集进行SVD分解
    simTotal = 0.0; ratSimTotal = 0.0
    # 奇异值分解
    # 在SVD分解之后，我们只利用包含了90%能量值的奇异值，这些奇异值会以NumPy数组形式得以保存
    U,Sigma,VT = la.svd(dataMat)
    # 如果要进行矩阵运算，就必须要用这些奇异值构建出一个对角矩阵
    Sig4 = mat(eye(4)*Sigma[:4])
    # 这和standEst()函数中的for循环目的一样，只不过这里的相似度计算是在低维空间下进行的
    xformedItems = dataMat.T * U[:,:4] * Sig4.I
    for j in range(n):
        userRating = dataMat[user,j]
        if userRating == 0 or j == item:
            continue
        # 相似度的计算方法也会作为一个参数传递给该函数
        similarity = simMeas(xformedItems[item,:].T,\
                             xformedItems[j,:].T)
        print("the %d and %d similarity is: %f" % (item, j, similarity))
        # 对相似度不断累加求和
        simTotal += similarity
        # 对相似度及对应评分值的乘积求和
        ratSimTotal += similarity * userRating
    if simTotal == 0:
        return 0
    else:
        # 计算估计评分
        return ratSimTotal / simTotal

# 14-4 图像压缩函数
# 打印矩阵函数
def printMat(inMat, thresh=0.8):
    """Print a 2-D matrix as rows of '1' and '0' characters.

    Because the matrix may hold floating-point values, each element is
    compared against a threshold: an element strictly greater than thresh
    prints as "1", anything else as "0". One output line per matrix row.

    Parameters:
        inMat: 2-D NumPy matrix or array, indexable as inMat[i, k].
        thresh: cut-off between "0" and "1".

    The extent is taken from inMat's shape rather than a fixed 32x32, so
    smaller inputs no longer raise IndexError and larger ones are printed
    in full; a 32x32 input prints exactly as before.
    """
    rows, cols = inMat.shape
    for i in range(rows):
        for k in range(cols):
            if float(inMat[i, k]) > thresh:
                print("1", end="")
            else:
                print("0", end="")
        print('')

def imgCompress(numSV=3, thresh=0.8, filename='0_5.txt'):
    """Print an image file before and after a rank-numSV SVD approximation.

    The file is read as text, one image row per line; the first 32
    characters of each non-blank line are converted to integers. The
    original matrix is printed, then the matrix rebuilt from the leading
    numSV singular values, both through printMat.

    Parameters:
        numSV: number of singular values used for the reconstruction;
            clamped to the number of singular values available.
        thresh: threshold handed to printMat.
        filename: path of the image text file. Defaults to '0_5.txt',
            the path this function previously hard-coded.

    Raises:
        ValueError: if numSV is negative, or a character in the first 32
            columns of a line is not a digit.
    """
    if numSV < 0:
        raise ValueError("numSV must not be negative")
    width = 32  # columns read from every line
    # The context manager closes the file even if parsing fails.
    with open(filename) as imgFile:
        myl = [[int(ch) for ch in line[:width]]
               for line in imgFile if line.strip()]
    myMat = asmatrix(myl)
    print("****original matrix****")
    printMat(myMat, thresh)
    # la.svd returns only the diagonal of Sigma as a 1-D array, so the
    # leading values are placed back on the diagonal of a square matrix.
    U, Sigma, VT = la.svd(myMat)
    numSV = min(numSV, len(Sigma))
    SigRecon = asmatrix(diag(Sigma[:numSV]))
    reconMat = U[:, :numSV] * SigRecon * VT[:numSV, :]
    print("****reconstructed matrix using %d singular values****" % numSV)
    printMat(reconMat, thresh)

以下为测试代码部分

from numpy import *
# Sigma is conceptually a diagonal matrix; linalg.svd returns only its
# diagonal, as a 1-D array of singular values.
# linalg: NumPy's linear algebra toolbox
# U,Sigma,VT = linalg.svd([[1, 1],[7, 7]])
#
# print(U)
# print(Sigma)
# print(VT)

# SVD: singular value decomposition
# Decompose the small example matrix.
import svdRec
Data = svdRec.loadExData()
U,Sigma,VT = linalg.svd(Data)
# print(Sigma)

# Build a 3x3 diagonal matrix from the first three singular values.
# asmatrix is used instead of mat, which NumPy 2.0 removed.
Sig3 = asmatrix([[Sigma[0], 0, 0], [0, Sigma[1], 0], [0, 0, Sigma[2]]])
# Only the first three columns of U and first three rows of VT are needed.
# print(U[:,:3] * Sig3 * VT[:3,:])

myMat = asmatrix(svdRec.loadExData())
# Euclidean-distance similarity
print("欧氏距离:")
print(svdRec.ecludSim(myMat[:,0],myMat[:,4]))
print(svdRec.ecludSim(myMat[:,0],myMat[:,0]))

# Cosine similarity
print("余弦相似度:")
print(svdRec.cosSim(myMat[:,0],myMat[:,4]))
print(svdRec.cosSim(myMat[:,0],myMat[:,0]))

# Pearson correlation similarity
print("皮尔逊相关系数:")
print(svdRec.pearsSim(myMat[:,0],myMat[:,4]))
print(svdRec.pearsSim(myMat[:,0],myMat[:,0]))

# Start again from the example data and overwrite a few ratings.
myMat = asmatrix(svdRec.loadExData())
myMat[0,1] = myMat[0,0] = myMat[2,0] = 4
myMat[3,3] = 2
print("myMat:")
print(myMat)

# Recommendations for user 2 under each similarity measure.
print("推荐算法：")
print(svdRec.recommend(myMat, 2))
print(svdRec.recommend(myMat, 2, simMeas=svdRec.ecludSim))
print(svdRec.recommend(myMat, 2, simMeas=svdRec.pearsSim))

from numpy import linalg as la
U,Sigma,VT = la.svd(asmatrix(svdRec.loadExData2()))
# See how many singular values are needed to reach 90% of the total energy.
print("Sigma:")
print(Sigma)
# Energy is the squared singular value.
Sig2 = Sigma**2

# Total energy
print("sum(Sig2):")
print(sum(Sig2))

# 90% of the total energy
print("sum(Sig2)*0.9:")
print(sum(Sig2)*0.9)

# Energy held by the first two singular values
print("sum(Sig2[:2]):")
print(sum(Sig2[:2]))

# If that falls short of 90%, look at the first three singular values.
print("sum(Sig2[:3]):")
print(sum(Sig2[:3]))

myMat = asmatrix(svdRec.loadExData2())
# SVD-based estimate with the default similarity measure
print("使用默认相似度计算方法")
print(svdRec.recommend(myMat, 1, estMethod=svdRec.svdEst))
# SVD-based estimate with a different similarity measure
print("使用另一种相似度计算方法")
print(svdRec.recommend(myMat, 1, estMethod=svdRec.svdEst,simMeas=svdRec.pearsSim))

svdRec.imgCompress(2)