统计学习方法第十七章作业:LSA潜在语义分析算法 代码实现

LSA潜在语义分析算法

import numpy as np
import jieba
import collections

class LSA:
    def __init__(self,text_list):
        self.text_list = text_list
        self.text_num = len(text_list)
        self.get_X()

    def get_X(self):
        self.cuted_text = [jieba.lcut(text,cut_all=True) for text in self.text_list]
        self.word_all = []
        for i in self.cuted_text:
            self.word_all.extend(i)
        self.word_set = list(set(self.word_all))
        self.word_num = len(self.word_set)
        self.word_dict = {}
        for index,word in enumerate(self.word_set):
            self.word_dict[word] = index
        self.X = np.zeros((self.word_num,self.text_num))
        self.word_IDF = {word:0 for word in self.word_set}
        for i in self.cuted_text:
            for word in set(i):
                self.word_IDF[word] += 1
        for i in range(self.text_num):
            count_ = collections.Counter(self.cuted_text[i])
            tf_sum = len(self.cuted_text[i])
            for k, v in count_.items():
                self.X[self.word_dict[k],i] = v/tf_sum * np.log(self.text_num/self.word_IDF[k])

    def norm_w(self):
        norm = 1 / np.array([np.sqrt(np.sum(self.W[:, x] ** 2)) for x in range(self.k)])
        for i in range(self.k):
            self.W[:, i] = self.W[:, i] * norm[i]

    def SVD(self,k):
        u,s,v = np.linalg.svd(self.X)
        return u[:,:k],s[:k].dot(v[:,:k].T)

    def nonegetive(self,k,max_iter,way='MES'):
        self.k = k
        self.W = np.random.random((self.word_num, self.k))
        self.H = np.random.random((self.k, self.text_num))
        if way == 'MES':
            for iter in range(max_iter):
                self.norm_w()
                n_w = (self.X.dot(self.H.T))
                m_w = (self.W.dot(self.H).dot(self.H.T))
                n_h = (self.W.T.dot(self.X))
                m_h = self.W.T.dot(self.W).dot(self.H)
                for j in range(self.k):
                    for i in range(self.word_num):
                        self.W[i,j] = self.W[i,j] * n_w[i,j]/m_w[i,j]
                    for i in range(self.text_num):
                        self.H[j,i] = self.H[j,i] * n_h[j,i]/m_h[j,i]
            self.norm_w()

        if way == 'DIV':
            for iter in range(max_iter):
                self.norm_w()
                W = self.W
                for k_ in range(self.k):
                    for i in range(self.word_num):
                        self.W[i,k_] = self.W[i,k_]*np.sum(self.H[k_].dot(self.X[i])\
                                                        /(self.W.dot(self.H)[i]))/np.sum(self.H[k_])
                for k_ in range(self.k):
                    for j in range(self.text_num):
                        self.H[k_,j] = self.H[k_,j]*np.sum(W[:,k_].dot(self.X[:,j])\
                                                         /(W.dot(self.H)[:,j]))/np.sum(W[:,k_])
            self.norm_w()



def main():
    text_list = [……]
    lsa = LSA(text_list)
    lsa.nonegetive(2,100,way='MES')
    print(lsa.W)
    print(lsa.H)

if __name__ == '__main__':
    main()

散度损失非负矩阵分解算法推导

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值