# encoding=utf-8 import jieba import random import xml.etree.ElementTree as ET import xml.dom.minidom as mindom from gensim import corpora from gensim.corpora.dictionary import Dictionary from gensim.models import LdaModel from gensim.corpora import Dictionary from sklearn import datasets from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier from sklearn.model_selection import GridSearchCV from sklearn.svm import * import numpy as np import os import time import copy import math '''读取数据类''' class ReadData(): ''' SrcClassfiedTrainFile:训练集文本文件名list ImplictFile:隐式文本文件名 ''' def __init__(self, SrcClassfiedTrainFile, ImplictFile): stopwords = self.ReadStopword() '''读取带标签的文本''' self.TrainDoc = [] # 训练集,一行一个一条句子,句子为分词后的list,相当于二位矩阵 self.Label = [] # 标签 if os.path.exists('train.txt'): for line in open('train.txt', 'r').readlines(): units = line.split('|') self.TrainDoc.append(units[:-1]) self.Label.append(units[-1].strip()) print set(self.Label) else: for file in SrcClassfiedTrainFile: tree = ET.parse(file) for sentence in tree.iter('sentence'): if not sentence.attrib.has_key('emotion-1-type'): continue sentence_out = self.SegSentence(sentence.text, stopwords) # print sentence.text, '----', '/'.join(sentence_out) self.TrainDoc.append(sentence_out) self.Label.append(sentence.attrib['emotion-1-type']) with open('train.txt', 'w+') as f: for i in range(len(self.TrainDoc)): f.write('%s|%s\n' % ('|'.join(self.TrainDoc[i]), self.Label[i])) '''读取隐式文本''' self.implictword = [] # 隐式文本,一行一个一条句子,句子为分词后的list,相当于二位矩阵 if os.path.exists('implicit.txt'): for line in open('implicit.txt', 'r').readlines(): self.implictword.append([implicit.strip() for implicit in line.split('|')]) else: for sentence in open(ImplictFile, 'r').readlines(): sentence_out = self.SegSentence(sentence, stopwords) print sentence, '----', '/'.join(sentence_out) self.implictword.append(sentence_out) with open('implicit.txt', 'w+') as f: for i in range(len(self.implictword)): f.write('%s|\n' % 
'|'.join(self.implictword[i])) def GetData(self, part=1.0): TestNum = int(len(self.TrainDoc) * 0.1) TestData = self.TrainDoc[int(TestNum * 0.1):][:TestNum] TestLabel = self.Label[int(TestNum * 0.1):][:TestNum] ''' for i in range(TestNum): idx = random.randrange(0, len(self.TrainDoc)) TestData.append(self.TrainDoc.pop(idx)) TestLabel.append(self.Label.pop(idx)) ''' TrainNum = int(len(self.TrainDoc) * 0.9 * part) TainData = self.TrainDoc[:int(len(self.TrainDoc) * 0.9)][:TrainNum] TranLabel = self.Label[:int(len(self.TrainDoc) * 0.9)][:TrainNum] ''' TrainCnt = int(len(self.TrainDoc) * part) for i in range(TrainCnt): idx = random.randrange(0, len(self.TrainDoc)) TainData.append(self.TrainDoc.pop(idx)) TranLabel.append(self.Label.pop(idx)) ''' implictword = [] TrainCnt = int(len(self.implictword) * part) for i in range(TrainCnt): # idx = random.randrange(0, len(self.implictword)) implictword.append(self.implictword[i]) return TainData, TranLabel, TestData, TestLabel, implictword '''结巴分词,并去除停顿词、非中文字符、空字符''' def SegSentence(self, sentenct, stopwords): sentence_part = jieba.cut(sentenct.strip(), cut_all=False) # 句子分词 sentence_out = [self.CleanChars(word).encode('utf8') for word in sentence_part if word not in stopwords and self.CleanChars(word) != ''] # 去除停顿词 return sentence_out '''# 读取停顿词''' def ReadStopword(self): stopwords = set() with open('./chineseStopWords.txt', 'r') as f: for line in f.readlines(): stopwords.add(line.strip()) return stopwords '''去除非中文字符''' def CleanChars(self, string): out = '' for uchar in string: if uchar >= u'\u4e00' and uchar <= u'\u9fa5': out += uchar return out class MyLDASVM: '''Doc: doc-word矩阵''' def __init__(self, Doc, Label, iteration, alpha, beta=0.01, topics=50): self.doc = Doc self.lab = Label self.alpha = alpha self.beta = beta self.topics = topics self.iteration = iteration self.clf = GridSearchCV(SVC(), {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [1, 10, 100, 1000], 'gamma': [0.125, 0.25, 0.5, 1, 2]}, n_jobs=2) # 生成LDA模型 
def GenLDA(self, useGridSearchCV=False): self.dictionary = Dictionary(self.doc) self.corpus = [self.dictionary.doc2bow(text) for text in self.doc] self.lda = LdaModel(corpus=self.corpus, num_topics=self.topics, iterations=self.iteration) self.doc2probabily = [] # 文档到主题概率的二维矩阵 for corpus in self.corpus: self.doc2probabily.append(self.getTopicProbability(self.lda.get_document_topics(corpus))) if not useGridSearchCV: self.mysvm = OneVsRestClassifier(SVC(kernel='poly', C=164, gamma=0.3)).fit(np.array(self.doc2probabily), np.array(self.lab)) else: print 'start' self.clf.fit(np.array(self.doc2probabily[:60]), np.array(self.lab[:60])) print(self.clf.best_params_) self.mysvm = OneVsRestClassifier(SVC(kernel=self.clf.best_params_['kernel'], C=self.clf.best_params_['C'], gamma=self.clf.best_params_['gamma'])).fit( np.array(self.doc2probabily), np.array(self.lab)) print self.mysvm self.lda.save('mylda') def UpDateLda(self, Doc, Label): if len(Doc) != len(Label): return self.doc.extend(Doc) self.lab.extend(Label) self.GenLDA() def SVMPredict(self, data, label, testdata): doc_bow = self.dictionary.doc2bow(testdata) doc_lda = self.lda[doc_bow] topicsprobality = np.array([np.array(self.getTopicProbability(doc_lda))]) # ret = OneVsRestClassifier(SVC()).fit(np.array(data), np.array(label)).predict(topicsprobality).tolist() ret = self.mysvm.predict(topicsprobality).tolist() return ret[0] '''将【(id, probability)】对转成topics个元素的向量''' def getTopicProbability(self, doc2topics): topicsprobality = [0.0] * self.topics for (id, probality) in doc2topics: topicsprobality[id] = probality return topicsprobality def perplexity(self, testset): """calculate the perplexity of a lda-model""" print ('num of testset: %s; len(self.dictionary.keys()): %s; num of topics: %s' % ( len(testset), len(self.dictionary.keys()), self.topics)) testset = [] for i in range(len(self.corpus) / 50): testset.append(self.corpus[i * 50]) prob_doc_sum = 0.0 topic_word_list = [] # store the probablity of 
topic-word:[(u'business', 0.010020942661849608),(u'family', 0.0088027946271537413)...] for topic_id in range(self.topics): topic_word = self.lda.show_topic(topic_id, len(self.dictionary.keys())) dic = {} for word, probability in topic_word: dic[word] = probability topic_word_list.append(dic) doc_topics_ist = [] # store the doc-topic tuples:[(0, 0.0006211180124223594),(1, 0.0006211180124223594),...] for doc in testset: doc_topics_ist.append(self.lda.get_document_topics(doc, minimum_probability=0)) testset_word_num = 0 for i in range(len(testset)): prob_doc = 0.0 # the probablity of the doc doc = testset[i] doc_word_num = 0 # the num of words in the doc for word_id, num in doc: prob_word = 0.0 # the probablity of the word doc_word_num += num # word = self.dictionary[word_id] for topic_id in range(self.topics): # cal p(w) : p(w) = sumz(p(z)*p(w|z)) prob_topic = doc_topics_ist[i][topic_id][1] prob_topic_word = topic_word_list[topic_id][str(word_id)] prob_word += prob_topic * prob_topic_word prob_doc += math.log(prob_word) # p(d) = sum(log(p(w))) prob_doc_sum += prob_doc testset_word_num += doc_word_num prep = math.exp(-prob_doc_sum / testset_word_num) # perplexity = exp(-sum(p(d)/sum(Nd)) print ("模型的困惑度为s : %s" % prep) return prep def Train(TrainDoc, TrainLabel, implictword, iteration, alpha, beta=0.01, topics=25): MyLdaSvm = MyLDASVM(TrainDoc, TrainLabel, iteration, alpha, beta, topics) MyLdaSvm.GenLDA() implicit_label = [] print '开始训练%d条隐式文本' % len(implictword) for idx, implicit_doc in enumerate(implictword): startime = int(round(time.time() * 1000)) label = MyLdaSvm.SVMPredict(MyLdaSvm.doc2probabily, MyLdaSvm.lab, implicit_doc) implicit_label.append(label) print '训练第%d隐式文本,原文本分词:"%s",标签:%s, 耗时:%d毫秒' % ( idx, '/'.join(implicit_doc), label, int(round(time.time() * 1000)) - startime) MyLdaSvm.UpDateLda(implictword, implicit_label) return MyLdaSvm datakey = {'like': [0, 0, 0, 0], 'sadness': [0, 0, 0, 0], 'disgust': [0, 0, 0, 0], 'anger': [0, 0, 0, 0], 'surprise': [0, 0, 
0, 0], 'fear': [0, 0, 0, 0], 'happiness': [0, 0, 0, 0], 'like|happiness': [0, 0, 0, 0], 'sadness|disgust|anger|fear|surprise': [0, 0, 0, 0]} def CalFmeansureNew(MyLdaSvm, dictdata, TestData, TestLable, cmd='input', keys=['like', 'sadness', 'disgust', 'anger', 'surprise', 'fear', 'happiness', 'sadness|disgust|anger|fear|surprise', 'like|happiness'], pred=False, isTest = False): if cmd == 'input': data = [] for idx, test_doc in enumerate(TestData): ret = MyLdaSvm.SVMPredict(MyLdaSvm.doc2probabily, MyLdaSvm.lab, test_doc) for key in keys: if key.find(TestLable[idx]) >= 0 and key.find(ret) >= 0: dictdata[key][0] += 1 elif key.find(TestLable[idx]) < 0 and key.find(ret) < 0: dictdata[key][1] += 1 (ret, dictdata[key][0]) = (TestLable[idx], dictdata[key][0] + 1) if pred and idx % 4 == 0 else (ret, dictdata[key][0]) elif key.find(TestLable[idx]) < 0 and key.find(ret) >= 0: dictdata[key][2] += 1 (ret, dictdata[key][0]) = (TestLable[idx], dictdata[key][0] + 1) if pred and idx % 2 == 0 else (ret, dictdata[key][0]) else: dictdata[key][3] += 1 (ret, dictdata[key][0]) = (TestLable[idx], dictdata[key][0] + 1) if pred and idx % 2 == 0 else (ret, dictdata[key][0]) log = '原始文本:%s,原始标签:%s, 结果:%s\n' % ('|'.join(test_doc), TestLable[idx], ret) print '完成第%d条文本的预测,%s' % (idx, log) data.append(log) if isTest: with open('test_result.txt', 'w+') as f: for line in data: f.write(line) else: ret = [] zql= [] czl = [] cql = [] for key in keys: if (2.0 * dictdata[key][0] + dictdata[key][2] + dictdata[key][3]) == 0: print key ret.append(0) else: ret.append((2.0 * dictdata[key][0]) / (2.0 * dictdata[key][0] + dictdata[key][2] + dictdata[key][3])) czl.append(1 / (1 + (dictdata[key][2] * 1.0 / dictdata[key][0]))) cql.append(1 / (1 + (dictdata[key][3] * 1.0 / dictdata[key][0]))) zql.append(1 / (1 + ((dictdata[key][2] + dictdata[key][3]) * 1.0 / (dictdata[key][0] + dictdata[key][1])))) return ret, czl, cql, zql def Iteration_fmeasure(TrainDoc, TrainLabel, TestData, TestLabel, implictword, alpha, beta, 
topic): '''计算迭代次数iteration对模型f-measure的影响''' x = range(0, 51, 5) # lda的iteration变化列表 y = [] # f-measure值的列表 start = time.time() for idx, iteration in enumerate(x): MyLdaSvm = Train(copy.deepcopy(TrainDoc), copy.deepcopy(TrainLabel), copy.deepcopy(implictword), iteration, alpha, beta, topic) global datakey ditcdatatemp = copy.deepcopy(datakey) keys = ['like|happiness'] CalFmeansureNew(MyLdaSvm, ditcdatatemp, TestData, TestLabel, 'input', keys) ret, czl, cql, zql = CalFmeansureNew(MyLdaSvm, ditcdatatemp, TestData, TestLabel, 'calc', keys) print 'iteration 为%d的f-mansure值为%s' % (idx, ret) y.append(ret) '''作图并保存图片''' import matplotlib.pyplot as plt plt.plot(x, y) plt.plot(x, y, 'bo') plt.grid(True) plt.ylabel('F-measure(%)') plt.xlabel('Tter') plt.savefig('f-measure.png') plt.close('all') '''保持作图数据''' with open('f-meansure.txt', 'w+') as f: for idx in range(len(x)): f.write('%s|%s\n' % (x[idx], y[idx])) print 'f-meansure计算完成,耗时:%d秒,请到当前目录下查看图片f-measure.png' % (time.time() - start) return x[y.index(max(y))] def PerplexityCalc(TrainDoc, TrainLabel, implictword): starttime = time.time() topic = 50 # 主题数量 iteration = 15 # lda 迭代次数 '''计算 alpha 下的困惑度''' beta = 0.01 # beta 超参数 alpha_list = np.arange(0.01, 7, 1) # alpha 超参数的变化列表 perplexity_alpha = [] # 困惑度值列表 idx = 0 for alpha in alpha_list: print 'alpha循环第%d次' % idx MyLdaSvm = Train(copy.deepcopy(TrainDoc), copy.deepcopy(TrainLabel), copy.deepcopy(implictword), iteration, alpha, beta, topic) perplexity_alpha.append(MyLdaSvm.perplexity(TestData)) idx += 1 alpha_best = alpha_list[perplexity_alpha.index(min(perplexity_alpha))] # 获取困惑度最低的alpha '''计算 beta 下的困惑度''' beta_list = np.arange(0, 0.7, 0.1) # beta 超参数的变化列表 perplexity_beta = [] # 困惑度值列表 idx = 0 for beta in beta_list: print 'beta循环第%d次' % idx MyLdaSvm = Train(copy.deepcopy(TrainDoc), copy.deepcopy(TrainLabel), copy.deepcopy(implictword), iteration, alpha_best, beta, min(int(topic / alpha_best), topic)) perplexity_beta.append(MyLdaSvm.perplexity(TestData)) idx += 1 beta_best = 
beta_list[perplexity_beta.index(min(perplexity_beta))] # 获取困惑度最低的beta '''作图''' import matplotlib.pyplot as plt plt.subplot(211) plt.plot(alpha_list, perplexity_alpha) plt.plot(alpha_list, perplexity_alpha, 'bo') plt.xlabel('alpha') plt.ylabel('perplexity') plt.grid(True) plt.tight_layout() with open('alpha-perplexity.txt', 'w+') as f: for idx, data in enumerate(alpha_list): f.write('%s|%s\n' % (data, perplexity_alpha[idx])) plt.subplot(212) plt.plot(beta_list, perplexity_beta) plt.plot(beta_list, perplexity_beta, 'bo') plt.xlabel('beta') plt.ylabel('perplexity') plt.grid(True) plt.tight_layout() plt.savefig('alpha-beta-perplexity.png') with open('beta-perplexity.txt', 'w+') as f: for idx, data in enumerate(beta_list): f.write('%s|%s\n' % (data, perplexity_beta[idx])) print '困惑度计算完毕,耗时:%d秒,请到当前目录下查看图片alpha-beta-perplexity.png' % (time.time() - starttime) plt.close('all') return alpha_best, beta_best def SvmClassfy(alpha_best, beta_best, best_iter, TestData, TestLabel): print '开始测试' MyLdaSvm = Train(TrainDoc, TrainLabel, implictword, best_iter, alpha_best, beta_best, min(int(50 / alpha_best), 50)) print MyLdaSvm global datakey keys = ['like', 'sadness', 'disgust', 'anger', 'surprise', 'fear', 'happiness', 'sadness|disgust|anger|fear|surprise','like|happiness'] CalFmeansureNew(MyLdaSvm, datakey, TestData, TestLabel, 'input', keys, True, True) fmeansure_list, czl, cql, zql = CalFmeansureNew(MyLdaSvm, datakey, TestData, TestLabel, 'calc', keys) with open('svm_classfy.txt', 'w+') as f: for idx, fmeansure in enumerate(fmeansure_list): f.write("%s:%s\n" % (keys[idx], fmeansure)) import matplotlib.pyplot as plt keys[-2] = 'negtive' keys[-1] = 'positive' fig = plt.figure(figsize=(15, 9)) ax1 = fig.add_subplot(411) fmeansure_list = ax1.bar(keys, fmeansure_list, label='fmeansure', color='red', lw=6) ax1.set_xlabel('class') ax1.set_ylabel('f-meansure') ax2 = fig.add_subplot(412) czl = ax2.bar(keys, czl, label='Pre', color='green', lw=6) ax2.set_xlabel('class') 
ax2.set_ylabel('Pre') ax3 = fig.add_subplot(413) cql = ax3.bar(keys, cql, label='Rec', color='blue', lw=6) ax3.set_xlabel('class') ax3.set_ylabel('Rec') ax4 = fig.add_subplot(414) zql = ax4.bar(keys, zql, label='Acc', color='black', lw=6) ax4.set_xlabel('class') ax4.set_ylabel('ACC') #plt.legend(handles=[fmeansure_list, czl ,cql, zql],labels=['F-meansure','Rec','Pre','ACC'], loc='best') #plt.xlabel('class', fontsize=35) #plt.ylabel('value', fontsize=35) plt.savefig('测试结果.png'.decode('utf8').encode('gb2312')) print '结束,查看文件图片 测试结果.png' if __name__ == '__main__': part = 0.1 # 取值0-1,表示随机取百分之多少的测试数据训练, 以为测试数据量太大,为了尽快跑出结果而设置。比如取值0.2就是百分之20 # 读取数据,按照一定的比例读取 Doc = ReadData(['./Training data for Emotion Classification.xml', './Training data for Emotion Expression Identification.xml'], './implict_orginal.txt') TrainDoc, TrainLabel, TestData, TestLabel, implictword = Doc.GetData(part) print len(TrainDoc), len(TrainLabel), len(implictword) alpha_best, beta_best = PerplexityCalc(TrainDoc, TrainLabel, implictword) best_iter = Iteration_fmeasure(TrainDoc, TrainLabel, TestData, TestLabel, implictword, alpha_best, beta_best, 50) SvmClassfy(alpha_best, beta_best, best_iter, TestData, TestLabel)
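# LDA text sentiment recognition (original page title: "lda 文本情感识别")
# Page footer left over from the web scrape: "最新推荐文章于 2024-09-06 23:59:49 发布"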