Corr-LDA

最新推荐文章于 2021-07-06 12:57:39 发布
Json_Nie
最新推荐文章于 2021-07-06 12:57:39 发布
阅读量2.6k
点赞数
分类专栏： Python
本文链接：https://blog.csdn.net/DreamD1987/article/details/9672349
版权
Python 专栏收录该内容
24 篇文章 0 订阅
订阅专栏
不知道写的对不对。先放到这里，有空了再改改。
'''
Created on Jul 31, 2013

@author: Nie
'''
import numpy

class LDA:
    def __init__(self, K, alpha, beta, docs, visula, V, N, smartinit=True):
        self.K = K
        self.alpha = alpha # parameter of topics prior
        self.beta = beta   # parameter of words prior
        self.docs = docs
        self.visual = visula
        self.V = V

        self.z_m_n = [] # topics of words of documents
        self.n_m_z = numpy.zeros((len(self.docs), K)) + alpha     # word count of each document and topic
        self.n_z_t = numpy.zeros((K, V)) + beta # word count of each topic and vocabulary
        self.n_z = numpy.zeros(K) + V * beta    # word count of each topic

        self.z_m_n_v = [] # topics of words of documents
        self.n_m_z_v = numpy.zeros((len(self.visual), K)) + alpha     # word count of each document and topic
        self.n_z_t_v = numpy.zeros((K, N)) + beta # word count of each topic and vocabulary
        self.n_z_v = numpy.zeros(K) + N * beta
        
        self.N = 0
        self.N_v = 0
        for m, (doc, visual) in enumerate(zip(docs, visula)):
            self.N += len(doc)
            self.N_v += len(visual)
            z_n = []
            z_n_v = []
            
            for t in doc:
                if smartinit:
                    p_z = self.n_z_t[:, t] * self.n_m_z[m] / self.n_z
                    z = numpy.random.multinomial(1, p_z / p_z.sum()).argmax()
                else:
                    z = numpy.random.randint(0, K)
                z_n.append(z)
                self.n_m_z[m, z] += 1
                self.n_z_t[z, t] += 1
                self.n_z[z] += 1
            self.z_m_n.append(numpy.array(z_n))
            
            for t in visual:
                if smartinit:
                    p_z = self.n_z_t_v[:, t] * self.n_m_z_v[m] / self.n_z_v
                    z = numpy.random.multinomial(1, p_z / p_z.sum()).argmax()
                else:
                    z = numpy.random.randint(0, K)
                z_n_v.append(z)
                self.n_m_z_v[m, z] += 1
                self.n_z_t_v[z, t] += 1
                self.n_z_v[z] += 1
            self.z_m_n_v.append(numpy.array(z_n_v))

    def inference(self):
        """learning once iteration"""
        for m, doc in enumerate(self.docs):
            z_n = self.z_m_n[m]
            n_m_z = self.n_m_z[m]
            for n, t in enumerate(doc):
                # discount for n-th word t with topic z
                z = z_n[n]
                n_m_z[z] -= 1
                self.n_z_t[z, t] -= 1
                self.n_z[z] -= 1

                # sampling topic new_z for t
                p_z = self.n_z_t[:, t] * n_m_z / self.n_z
                new_z = numpy.random.multinomial(1, p_z / p_z.sum()).argmax()

                # set z the new topic and increment counters
                z_n[n] = new_z
                n_m_z[new_z] += 1
                self.n_z_t[new_z, t] += 1
                self.n_z[new_z] += 1
        for m, doc in enumerate(self.visual):
            z_n = self.z_m_n_v[m]
            n_m_z = self.n_m_z_v[m]
            for n, t in enumerate(doc):
                # discount for n-th word t with topic z
                z = z_n[n]
                n_m_z[z] -= 1
                self.n_z_t_v[z, t] -= 1
                self.n_z_v[z] -= 1

                # sampling topic new_z for t
                p_z = self.n_z_t_v[:, t] * n_m_z / self.n_z_v
                new_z = numpy.random.multinomial(1, p_z / p_z.sum()).argmax()

                # set z the new topic and increment counters
                z_n[n] = new_z
                n_m_z[new_z] += 1
                self.n_z_t_v[new_z, t] += 1
                self.n_z_v[new_z] += 1

    def worddist(self):
        """get topic-word distribution"""
        return self.n_z_t / self.n_z[:, numpy.newaxis]
    
    def worddist_v(self):
        """get topic-word distribution"""
        return self.n_z_t_v / self.n_z_v[:, numpy.newaxis]

    def perplexity(self, docs=None):
        if docs == None: docs = self.docs
        phi = self.worddist()
        log_per = 0
        N = 0
        Kalpha = self.K * self.alpha
        for m, doc in enumerate(docs):
            theta = self.n_m_z[m] / (len(self.docs[m]) + Kalpha)
            for w in doc:
                log_per -= numpy.log(numpy.inner(phi[:,w], theta))
            N += len(doc)
        return numpy.exp(log_per / N)

def lda_learning(lda, iteration, voca):
    pre_perp = lda.perplexity()
    print "initial perplexity=%f" % pre_perp
    for i in range(iteration):
        lda.inference()
        perp = lda.perplexity()
        print "-%d p=%f" % (i + 1, perp)
        if pre_perp:
            if pre_perp < perp:
                output_word_topic_dist(lda, voca)
                pre_perp = None
            else:
                pre_perp = perp
    output_word_topic_dist(lda, voca)
    
def lda_learning_visula(lda, iteration, voca):
    pre_perp = lda.perplexity()
    print "initial perplexity=%f" % pre_perp
    for i in range(iteration):
        lda.inference()
        perp = lda.perplexity()
        print "-%d p=%f" % (i + 1, perp)
        if pre_perp:
            if pre_perp < perp:
                output_word_topic_dist_visual(lda, voca)
                pre_perp = None
            else:
                pre_perp = perp
    output_word_topic_dist_visual(lda, voca)

def output_word_topic_dist(lda, voca):
    zcount = numpy.zeros(lda.K, dtype=int)
    wordcount = [dict() for k in xrange(lda.K)]
    for xlist, zlist in zip(lda.docs, lda.z_m_n):
        for x, z in zip(xlist, zlist):
            zcount[z] += 1
            if x in wordcount[z]:
                wordcount[z][x] += 1
            else:
                wordcount[z][x] = 1
    
    zcount_v = numpy.zeros(lda.K, dtype=int)
    wordcount_v = [dict() for k in xrange(lda.K)]
    for xlist, zlist in zip(lda.visual, lda.z_m_n_v):
        for x, z in zip(xlist, zlist):
            zcount_v[z] += 1
            if x in wordcount_v[z]:
                wordcount_v[z][x] += 1
            else:
                wordcount_v[z][x] = 1

    phi = lda.worddist()
    phi_v = lda.worddist_v()
    for k in xrange(lda.K):
        print "\n-- topic: %d (%d words)" % (k, zcount[k])
        for w in numpy.argsort(-phi[k])[:20]:
            print "%s: %f (%d)" % (voca[w], phi[k,w], wordcount[k].get(w,0))
        for w in numpy.argsort(-phi_v[k])[:20]:
            print "%d: %f (%d)" % (w, phi_v[k,w], wordcount_v[k].get(w,0))

def output_word_topic_dist_visual(lda, voca):
    zcount = numpy.zeros(lda.K, dtype=int)
    wordcount = [dict() for k in xrange(lda.K)]
    for xlist, zlist in zip(lda.docs, lda.z_m_n):
        for x, z in zip(xlist, zlist):
            zcount[z] += 1
            if x in wordcount[z]:
                wordcount[z][x] += 1
            else:
                wordcount[z][x] = 1

    phi = lda.worddist()
    phi_v = lda.worddist_v()
    for k in xrange(lda.K):
        print "\n-- topic: %d (%d words)" % (k, zcount[k])
        for w in numpy.argsort(-phi[k])[:20]:
            print "%s: %f (%d)" % (voca[w], phi[k,w], wordcount[k].get(w,0))
        for w in numpy.argsort(-phi_v[k])[:20]:
            print "%d: %f (%d)" % (w, phi[k,w], wordcount[k].get(w,0))

def main():
    import optparse
    import vocabulary
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=4)
    parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False)
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0)
    (options, args) = parser.parse_args()
    #if not (options.filename or options.corpus): parser.error("need corpus filename(-f) or corpus range(-c)")

    if 1:
        corpus = vocabulary.load_file('ap.txt')
    else:
        corpus = vocabulary.load_corpus(options.corpus)
        if not corpus: parser.error("corpus range(-c) forms 'start:end'")
    if options.seed != None:
        numpy.random.seed(options.seed)
    
    f = open('visual.txt','r')
    visuallist = []
    for line in f.readlines():
        #visuallist.append(0)
        line = line[:-2]
        temp = line.split(' ')
        #temp.append(0)
        visuallist.append(temp)
    
#     voca = vocabulary.Vocabulary(False)
#     lda = LDA(options.K, options.alpha, options.beta, visuallist, 1001, options.smartinit)
#     print "corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(corpus), len(voca.vocas), options.K, options.alpha, options.beta)
#     lda_learning_visula(lda, options.iteration, voca)   

    voca = vocabulary.Vocabulary(False)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0: docs = voca.cut_low_freq(docs, options.df)
 
    lda = LDA(options.K, options.alpha, options.beta, docs, visuallist, voca.size(), 1001, options.smartinit)
    print "corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(corpus), len(voca.vocas), options.K, options.alpha, options.beta)
 
    #import cProfile
    #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile')
    lda_learning(lda, options.iteration, voca)

if __name__ == "__main__":
    main()