lda 文本情感识别

最新推荐文章于 2024-09-06 23:59:49 发布
csu_小王子
最新推荐文章于 2024-09-06 23:59:49 发布
阅读量1.2k
点赞数 1
分类专栏：编程 文章标签：lda
本文链接：https://blog.csdn.net/lsm424/article/details/88671148
版权
编程专栏收录该内容
10 篇文章 1 订阅
订阅专栏
# encoding=utf-8
import jieba
import random
import xml.etree.ElementTree as ET
import xml.dom.minidom as mindom
from gensim import corpora
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from sklearn import datasets
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import *
import numpy as np
import os
import time
import copy
import math

'''读取数据类'''


class ReadData():
    """Load, tokenise and cache the labelled corpus and the implicit corpus.

    Constructor arguments:
        SrcClassfiedTrainFile: list of XML file names holding labelled sentences.
        ImplictFile: name of the text file holding one implicit sentence per line.

    Tokenised data is cached in ``train.txt`` and ``implicit.txt`` in the
    working directory. When a cache file exists it is read instead of the
    corresponding source file(s).
    """

    def __init__(self, SrcClassfiedTrainFile, ImplictFile):
        stopwords = self.ReadStopword()

        # Labelled text: TrainDoc[i] is the token list of sentence i and
        # Label[i] is its label.
        self.TrainDoc = []
        self.Label = []
        if os.path.exists('train.txt'):
            with open('train.txt', 'r') as f:
                for line in f:
                    units = line.split('|')
                    # A sentence with no tokens is cached as "|label"; drop the
                    # empty field so the cached load matches the fresh load.
                    self.TrainDoc.append([unit for unit in units[:-1] if unit])
                    self.Label.append(units[-1].strip())
            print(set(self.Label))
        else:
            for xml_path in SrcClassfiedTrainFile:
                tree = ET.parse(xml_path)
                for sentence in tree.iter('sentence'):
                    # Only sentences carrying an emotion label are used.
                    if 'emotion-1-type' not in sentence.attrib:
                        continue
                    self.TrainDoc.append(self.SegSentence(sentence.text, stopwords))
                    self.Label.append(sentence.attrib['emotion-1-type'])
            with open('train.txt', 'w+') as f:
                for words, label in zip(self.TrainDoc, self.Label):
                    f.write('%s|%s\n' % ('|'.join(words), label))

        # Implicit (unlabelled) text: one token list per sentence.
        self.implictword = []
        if os.path.exists('implicit.txt'):
            with open('implicit.txt', 'r') as f:
                for line in f:
                    tokens = [token.strip() for token in line.split('|')]
                    # Every cached line ends with '|', which would otherwise
                    # yield a spurious empty token at the end of each document.
                    self.implictword.append([token for token in tokens if token])
        else:
            with open(ImplictFile, 'r') as f:
                for sentence in f:
                    sentence_out = self.SegSentence(sentence, stopwords)
                    print('%s ---- %s' % (sentence.strip(), '/'.join(sentence_out)))
                    self.implictword.append(sentence_out)
            with open('implicit.txt', 'w+') as f:
                for words in self.implictword:
                    f.write('%s|\n' % '|'.join(words))

    def GetData(self, part=1.0):
        """Split the loaded data and return
        (train docs, train labels, test docs, test labels, implicit docs).

        The test set holds 10% of the labelled sentences. The training set is
        the first ``part`` share of the first 90% of the labelled sentences,
        and the implicit set is the first ``part`` share of the implicit
        sentences.
        """
        total = len(self.TrainDoc)
        TestNum = int(total * 0.1)
        offset = int(TestNum * 0.1)
        # NOTE(review): the test window starts near the front of the list, so
        # it overlaps the training slice taken below - confirm this is intended.
        TestData = self.TrainDoc[offset:offset + TestNum]
        TestLabel = self.Label[offset:offset + TestNum]

        train_cap = int(total * 0.9)
        TrainNum = int(total * 0.9 * part)
        TainData = self.TrainDoc[:train_cap][:TrainNum]
        TranLabel = self.Label[:train_cap][:TrainNum]

        implicit_count = max(int(len(self.implictword) * part), 0)
        implictword = self.implictword[:implicit_count]

        return TainData, TranLabel, TestData, TestLabel, implictword

    def SegSentence(self, sentenct, stopwords):
        """Tokenise a sentence with jieba, dropping stop words and any token
        left empty once non-Chinese characters are removed.

        Returns the cleaned tokens encoded as UTF-8.
        """
        # NOTE(review): under Python 2 the stop words are read as byte strings
        # while jieba presumably yields unicode tokens, so non-ASCII stop words
        # may never match - confirm, and decode the stop-word file if so.
        sentence_out = []
        for word in jieba.cut(sentenct.strip(), cut_all=False):
            if word in stopwords:
                continue
            cleaned = self.CleanChars(word)
            if cleaned != '':
                sentence_out.append(cleaned.encode('utf8'))
        return sentence_out

    def ReadStopword(self):
        """Return the set of stop words listed one per line in ./chineseStopWords.txt."""
        with open('./chineseStopWords.txt', 'r') as f:
            return set(line.strip() for line in f)

    def CleanChars(self, string):
        """Return ``string`` with every character outside U+4E00..U+9FA5 removed."""
        return ''.join(uchar for uchar in string if u'\u4e00' <= uchar <= u'\u9fa5')


class MyLDASVM:
    """LDA topic model plus an SVM trained on the per-document topic vectors.

    Constructor arguments:
        Doc: tokenised documents, one token list per document.
        Label: one label per document.
        iteration: value passed to LdaModel as ``iterations``.
        alpha: document-topic prior, forwarded to LdaModel when positive.
        beta: topic-word prior, forwarded to LdaModel as ``eta`` when positive.
        topics: number of LDA topics.

    ``Doc`` and ``Label`` are kept by reference and extended in place by
    UpDateLda.
    """

    def __init__(self, Doc, Label, iteration, alpha, beta=0.01, topics=50):
        self.doc = Doc
        self.lab = Label
        self.alpha = alpha
        self.beta = beta
        self.topics = topics
        self.iteration = iteration
        # Parameter grid, only used by GenLDA(useGridSearchCV=True).
        self.clf = GridSearchCV(SVC(), {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [1, 10, 100, 1000], 'gamma': [0.125, 0.25, 0.5, 1, 2]}, n_jobs=2)

    def _SymmetricPrior(self, value):
        """Return ``value`` as a float when it is a strictly positive number, else None."""
        if isinstance(value, (int, float, np.number)) and value > 0:
            return float(value)
        return None

    def GenLDA(self, useGridSearchCV=False):
        """Build the dictionary and corpus, fit the LDA model and the SVM, and
        save the LDA model to the file ``mylda``.

        With ``useGridSearchCV`` the SVM parameters are first searched on the
        first 60 documents; otherwise fixed parameters are used.
        """
        self.dictionary = Dictionary(self.doc)
        self.corpus = [self.dictionary.doc2bow(text) for text in self.doc]

        lda_args = {'corpus': self.corpus, 'num_topics': self.topics, 'iterations': self.iteration}
        # The priors were previously stored but never handed to the model, so
        # changing alpha/beta had no effect. A non-positive value is not a
        # valid Dirichlet prior and is left to the library default.
        alpha = self._SymmetricPrior(self.alpha)
        if alpha is not None:
            lda_args['alpha'] = np.full(self.topics, alpha)
        eta = self._SymmetricPrior(self.beta)
        if eta is not None:
            lda_args['eta'] = eta
        self.lda = LdaModel(**lda_args)

        # Document-to-topic probability matrix, one row per document.
        self.doc2probabily = [self.getTopicProbability(self.lda.get_document_topics(bow))
                              for bow in self.corpus]
        features = np.array(self.doc2probabily)
        labels = np.array(self.lab)
        if not useGridSearchCV:
            self.mysvm = OneVsRestClassifier(SVC(kernel='poly', C=164, gamma=0.3)).fit(features, labels)
        else:
            print('start')
            self.clf.fit(np.array(self.doc2probabily[:60]), np.array(self.lab[:60]))
            print(self.clf.best_params_)
            best = self.clf.best_params_
            self.mysvm = OneVsRestClassifier(
                SVC(kernel=best['kernel'], C=best['C'], gamma=best['gamma'])).fit(features, labels)
            print(self.mysvm)
        self.lda.save('mylda')

    def UpDateLda(self, Doc, Label):
        """Append ``Doc``/``Label`` to the training data and rebuild both models.

        Does nothing when the two lists differ in length.
        """
        if len(Doc) != len(Label):
            return
        self.doc.extend(Doc)
        self.lab.extend(Label)
        self.GenLDA()

    def SVMPredict(self, data, label, testdata):
        """Return the SVM's predicted label for the token list ``testdata``.

        ``data`` and ``label`` are accepted for call compatibility and are not used.
        """
        doc_bow = self.dictionary.doc2bow(testdata)
        doc_lda = self.lda[doc_bow]
        topicsprobality = np.array([np.array(self.getTopicProbability(doc_lda))])
        ret = self.mysvm.predict(topicsprobality).tolist()
        return ret[0]

    def getTopicProbability(self, doc2topics):
        """Expand (topic id, probability) pairs into a list of ``self.topics``
        values, with 0.0 for every topic that is not listed."""
        topicsprobality = [0.0] * self.topics
        for topic_id, probality in doc2topics:
            topicsprobality[topic_id] = probality
        return topicsprobality

    def perplexity(self, testset):
        """Return the perplexity of the LDA model on a sample of its own corpus.

        ``testset`` is only used for the size shown in the first log line. The
        evaluation runs on every 50th document of ``self.corpus``. Returns
        ``inf`` when that sample contains no words.
        """
        print('num of testset: %s; len(self.dictionary.keys()): %s; num of topics: %s' % (
            len(testset), len(self.dictionary.keys()), self.topics))
        sample = [self.corpus[i * 50] for i in range(len(self.corpus) // 50)]

        # p(w|z) for every topic, keyed by the word as reported by show_topic.
        vocabulary_size = len(self.dictionary.keys())
        topic_word_list = [dict(self.lda.show_topic(topic_id, vocabulary_size))
                           for topic_id in range(self.topics)]

        prob_doc_sum = 0.0
        testset_word_num = 0
        for doc in sample:
            # Looked up by topic id rather than by list position, because the
            # returned list may not contain every topic.
            doc_topics = dict(self.lda.get_document_topics(doc, minimum_probability=0))
            for word_id, num in doc:
                # p(w) = sum over z of p(z) * p(w|z)
                prob_word = 0.0
                for topic_id in range(self.topics):
                    prob_word += doc_topics.get(topic_id, 0.0) * topic_word_list[topic_id][str(word_id)]
                # Each of the ``num`` occurrences contributes log p(w), which
                # matches the total word count used as the denominator below.
                prob_doc_sum += num * math.log(prob_word)
                testset_word_num += num

        if testset_word_num == 0:
            # Fewer than 50 documents, or only empty ones: nothing to evaluate.
            return float('inf')
        prep = math.exp(-prob_doc_sum / testset_word_num)  # exp(-sum(log p(d)) / sum(Nd))
        print("模型的困惑度为s : %s" % prep)
        return prep


def Train(TrainDoc, TrainLabel, implictword, iteration, alpha, beta=0.01, topics=25):
    """Fit a MyLDASVM on the labelled data, label every implicit document with
    it, then refit on the labelled and implicit data together.

    Returns the refitted MyLDASVM instance.
    """
    model = MyLDASVM(TrainDoc, TrainLabel, iteration, alpha, beta, topics)
    model.GenLDA()

    print('开始训练%d条隐式文本' % len(implictword))
    predicted = []
    for position, tokens in enumerate(implictword):
        began_ms = int(round(time.time() * 1000))
        guess = model.SVMPredict(model.doc2probabily, model.lab, tokens)
        predicted.append(guess)
        joined = '/'.join(tokens)
        elapsed_ms = int(round(time.time() * 1000)) - began_ms
        print('训练第%d隐式文本，原文本分词："%s"，标签：%s, 耗时：%d毫秒' % (
            position, joined, guess, elapsed_ms))

    # Add the implicit documents with their predicted labels and retrain.
    model.UpDateLda(implictword, predicted)
    return model


# Template of per-category counters, one independent four-slot list per
# category key. CalFmeansureNew fills the slots as:
#   [0] label and prediction both in the key, [1] neither in the key,
#   [2] only the prediction in the key,       [3] only the label in the key.
datakey = {
    category: [0, 0, 0, 0]
    for category in (
        'like',
        'sadness',
        'disgust',
        'anger',
        'surprise',
        'fear',
        'happiness',
        'like|happiness',
        'sadness|disgust|anger|fear|surprise',
    )
}

def CalFmeansureNew(MyLdaSvm, dictdata, TestData, TestLable, cmd='input', keys=None, pred=False, isTest=False):
    """Accumulate confusion counts (cmd == 'input') or turn them into metrics.

    MyLdaSvm: model whose SVMPredict labels each test document.
    dictdata: maps each key to four counters, updated in place in 'input' mode:
        [0] label and prediction both in the key, [1] neither in the key,
        [2] only the prediction in the key, [3] only the label in the key.
    TestData / TestLable: test documents and their labels ('input' mode only).
    keys: category keys to evaluate; a key may list several labels separated
        by '|'. Defaults to the seven single labels plus the two groups.
    pred: when true, some predictions are replaced by the true label (see the
        note in the body).
    isTest: when true, the per-document log is written to test_result.txt.

    Returns None in 'input' mode. For any other ``cmd`` returns four lists,
    aligned with ``keys``:
    (F-measure, TP/(TP+FP), TP/(TP+FN), (TP+TN)/total).
    A ratio whose denominator is zero is reported as 0.
    """
    if keys is None:
        keys = ['like', 'sadness', 'disgust', 'anger', 'surprise', 'fear', 'happiness',
                'sadness|disgust|anger|fear|surprise', 'like|happiness']

    if cmd == 'input':
        data = []
        for idx, test_doc in enumerate(TestData):
            ret = MyLdaSvm.SVMPredict(MyLdaSvm.doc2probabily, MyLdaSvm.lab, test_doc)
            truth = TestLable[idx]
            for key in keys:
                counts = dictdata[key]
                truth_in_key = key.find(truth) >= 0
                guess_in_key = key.find(ret) >= 0
                if truth_in_key and guess_in_key:
                    counts[0] += 1
                    continue
                if not truth_in_key and not guess_in_key:
                    counts[1] += 1
                    override_every = 4
                elif guess_in_key:
                    counts[2] += 1
                    override_every = 2
                else:
                    counts[3] += 1
                    override_every = 2
                # NOTE(review): with pred=True the prediction is replaced by the
                # true label and counted as a hit for a fixed subset of
                # documents, which inflates every reported metric. Behaviour
                # kept as is - confirm whether it is intended.
                if pred and idx % override_every == 0:
                    ret = truth
                    counts[0] += 1
            log = '原始文本：%s，原始标签：%s, 结果：%s\n' % ('|'.join(test_doc), truth, ret)
            print('完成第%d条文本的预测，%s' % (idx, log))
            data.append(log)
        if isTest:
            with open('test_result.txt', 'w+') as f:
                for line in data:
                    f.write(line)
    else:
        ret = []
        zql = []
        czl = []
        cql = []
        for key in keys:
            counts = dictdata[key]
            tp, tn, fp, fn = counts[0], counts[1], counts[2], counts[3]
            f_denominator = 2.0 * tp + fp + fn
            if f_denominator == 0:
                print(key)
                ret.append(0)
            else:
                ret.append((2.0 * tp) / f_denominator)
            # Without a true positive the two ratios below are zero (or
            # undefined); report 0.0 instead of dividing by zero.
            czl.append(1 / (1 + (fp * 1.0 / tp)) if tp else 0.0)
            cql.append(1 / (1 + (fn * 1.0 / tp)) if tp else 0.0)
            zql.append(1 / (1 + ((fp + fn) * 1.0 / (tp + tn))) if (tp + tn) else 0.0)
        return ret, czl, cql, zql


def Iteration_fmeasure(TrainDoc, TrainLabel, TestData, TestLabel, implictword, alpha, beta, topic):
    """Measure how the LDA iteration count affects the F-measure.

    For each iteration count in 0, 5, ..., 50 a model is trained on deep
    copies of the inputs and scored on the test set for the 'like|happiness'
    key. The curve is saved to f-measure.png and the raw values to
    f-meansure.txt.

    Returns the iteration count with the highest F-measure (the first one on
    ties).
    """
    x = list(range(0, 51, 5))  # iteration counts to try
    y = []  # F-measure for each entry of x
    keys = ['like|happiness']
    start = time.time()
    for iteration in x:
        MyLdaSvm = Train(copy.deepcopy(TrainDoc), copy.deepcopy(TrainLabel), copy.deepcopy(implictword), iteration, alpha, beta, topic)
        # Fresh counters for every run so results do not accumulate.
        ditcdatatemp = copy.deepcopy(datakey)
        CalFmeansureNew(MyLdaSvm, ditcdatatemp, TestData, TestLabel, 'input', keys)
        ret, czl, cql, zql = CalFmeansureNew(MyLdaSvm, ditcdatatemp, TestData, TestLabel, 'calc', keys)
        # One key was evaluated, so keep its scalar value; the plot, max()
        # and the output file then work on numbers rather than on lists.
        fmeasure = ret[0]
        print('iteration 为%d的f-mansure值为%s' % (iteration, fmeasure))
        y.append(fmeasure)

    # Plot the curve and save the picture.
    import matplotlib.pyplot as plt
    plt.plot(x, y)
    plt.plot(x, y, 'bo')
    plt.grid(True)
    plt.ylabel('F-measure(%)')
    plt.xlabel('Tter')
    plt.savefig('f-measure.png')
    plt.close('all')

    # Keep the plotted data.
    with open('f-meansure.txt', 'w+') as f:
        for iteration, fmeasure in zip(x, y):
            f.write('%s|%s\n' % (iteration, fmeasure))
    print('f-meansure计算完成，耗时：%d秒，请到当前目录下查看图片f-measure.png' % (time.time() - start))

    return x[y.index(max(y))]


def PerplexityCalc(TrainDoc, TrainLabel, implictword, TestData=None):
    """Sweep alpha, then beta, and return the pair with the lowest perplexity.

    For every candidate value a model is trained on deep copies of the inputs
    and its perplexity recorded. Both curves are saved to
    alpha-beta-perplexity.png and the raw values to alpha-perplexity.txt and
    beta-perplexity.txt.

    TestData is handed to MyLDASVM.perplexity. When omitted, the module-level
    ``TestData`` is used if it exists (the behaviour before this parameter was
    added), otherwise an empty list.

    Returns (alpha_best, beta_best).
    """
    if TestData is None:
        TestData = globals().get('TestData', [])

    starttime = time.time()
    topic = 50  # number of topics
    iteration = 15  # LDA iterations

    # Perplexity for each alpha, with beta fixed.
    beta = 0.01
    alpha_list = np.arange(0.01, 7, 1)
    perplexity_alpha = []
    for idx, alpha in enumerate(alpha_list):
        print('alpha循环第%d次' % idx)
        MyLdaSvm = Train(copy.deepcopy(TrainDoc), copy.deepcopy(TrainLabel), copy.deepcopy(implictword), iteration, alpha, beta, topic)
        perplexity_alpha.append(MyLdaSvm.perplexity(TestData))
    alpha_best = alpha_list[perplexity_alpha.index(min(perplexity_alpha))]

    # Perplexity for each beta, with the best alpha.
    # NOTE(review): the sweep starts at beta = 0, unlike the alpha sweep which
    # starts at 0.01 - confirm that a zero prior is meant to be tried.
    beta_list = np.arange(0, 0.7, 0.1)
    perplexity_beta = []
    beta_topics = min(int(topic / alpha_best), topic)
    for idx, beta in enumerate(beta_list):
        print('beta循环第%d次' % idx)
        MyLdaSvm = Train(copy.deepcopy(TrainDoc), copy.deepcopy(TrainLabel), copy.deepcopy(implictword), iteration, alpha_best, beta, beta_topics)
        perplexity_beta.append(MyLdaSvm.perplexity(TestData))
    beta_best = beta_list[perplexity_beta.index(min(perplexity_beta))]

    # Plot both curves, one above the other.
    import matplotlib.pyplot as plt
    plt.subplot(211)
    plt.plot(alpha_list, perplexity_alpha)
    plt.plot(alpha_list, perplexity_alpha, 'bo')
    plt.xlabel('alpha')
    plt.ylabel('perplexity')
    plt.grid(True)
    plt.tight_layout()
    with open('alpha-perplexity.txt', 'w+') as f:
        for value, score in zip(alpha_list, perplexity_alpha):
            f.write('%s|%s\n' % (value, score))
    plt.subplot(212)
    plt.plot(beta_list, perplexity_beta)
    plt.plot(beta_list, perplexity_beta, 'bo')
    plt.xlabel('beta')
    plt.ylabel('perplexity')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig('alpha-beta-perplexity.png')
    with open('beta-perplexity.txt', 'w+') as f:
        for value, score in zip(beta_list, perplexity_beta):
            f.write('%s|%s\n' % (value, score))
    print('困惑度计算完毕，耗时：%d秒，请到当前目录下查看图片alpha-beta-perplexity.png' % (time.time() - starttime))
    plt.close('all')
    return alpha_best, beta_best


def SvmClassfy(alpha_best, beta_best, best_iter, TestData, TestLabel, TrainDoc=None, TrainLabel=None, implictword=None):
    """Train the final model, score it on the test set and save the results.

    The F-measure per key is written to svm_classfy.txt and four bar charts
    (F-measure, Pre, Rec, ACC) are saved to one picture.

    TrainDoc, TrainLabel and implictword default to the module-level variables
    of the same names (the behaviour before these parameters were added).
    """
    module_globals = globals()
    if TrainDoc is None:
        TrainDoc = module_globals['TrainDoc']
    if TrainLabel is None:
        TrainLabel = module_globals['TrainLabel']
    if implictword is None:
        implictword = module_globals['implictword']

    print('开始测试')
    # Train extends the lists it is given, so work on copies as the other
    # callers of Train do.
    MyLdaSvm = Train(copy.deepcopy(TrainDoc), copy.deepcopy(TrainLabel), copy.deepcopy(implictword),
                     best_iter, alpha_best, beta_best, min(int(50 / alpha_best), 50))
    print(MyLdaSvm)

    keys = ['like', 'sadness', 'disgust', 'anger', 'surprise', 'fear', 'happiness', 'sadness|disgust|anger|fear|surprise','like|happiness']
    # Count into a private copy so the shared template stays zeroed and
    # repeated calls do not add up.
    counts = copy.deepcopy(datakey)
    CalFmeansureNew(MyLdaSvm, counts, TestData, TestLabel, 'input', keys, True, True)
    fmeansure_list, czl, cql, zql = CalFmeansureNew(MyLdaSvm, counts, TestData, TestLabel, 'calc', keys)
    with open('svm_classfy.txt', 'w+') as f:
        for idx, fmeansure in enumerate(fmeansure_list):
            f.write("%s：%s\n" % (keys[idx], fmeansure))

    import matplotlib.pyplot as plt
    # Shorter names for the two grouped keys on the chart axes.
    keys[-2] = 'negtive'
    keys[-1] = 'positive'
    fig = plt.figure(figsize=(15, 9))
    ax1 = fig.add_subplot(411)
    ax1.bar(keys, fmeansure_list, label='fmeansure', color='red', lw=6)
    ax1.set_xlabel('class')
    ax1.set_ylabel('f-meansure')
    ax2 = fig.add_subplot(412)
    ax2.bar(keys, czl, label='Pre', color='green', lw=6)
    ax2.set_xlabel('class')
    ax2.set_ylabel('Pre')
    ax3 = fig.add_subplot(413)
    ax3.bar(keys, cql, label='Rec', color='blue', lw=6)
    ax3.set_xlabel('class')
    ax3.set_ylabel('Rec')
    ax4 = fig.add_subplot(414)
    ax4.bar(keys, zql, label='Acc', color='black', lw=6)
    ax4.set_xlabel('class')
    ax4.set_ylabel('ACC')
    plt.savefig('测试结果.png'.decode('utf8').encode('gb2312'))
    print('结束，查看文件图片 测试结果.png')


if __name__ == '__main__':
    # Value between 0 and 1 passed to GetData: the share of the data to train
    # on (0.2 means 20%). Kept small because the full data set is large and a
    # smaller share gives results sooner.
    part = 0.1

    # Load the corpora and take the requested share.
    labelled_files = [
        './Training data for Emotion Classification.xml',
        './Training data for Emotion Expression Identification.xml',
    ]
    implicit_file = './implict_orginal.txt'
    Doc = ReadData(labelled_files, implicit_file)
    TrainDoc, TrainLabel, TestData, TestLabel, implictword = Doc.GetData(part)
    print('%d %d %d' % (len(TrainDoc), len(TrainLabel), len(implictword)))

    # Choose the priors by perplexity, then the iteration count by F-measure,
    # then run the final classification.
    alpha_best, beta_best = PerplexityCalc(TrainDoc, TrainLabel, implictword)
    best_iter = Iteration_fmeasure(TrainDoc, TrainLabel, TestData, TestLabel, implictword, alpha_best, beta_best, 50)
    SvmClassfy(alpha_best, beta_best, best_iter, TestData, TestLabel)