Python: using LDA to process an article, classify its sentences, and extract data


Split the article into ten topic classes. First, load the corpus: cut the text into sentences, segment each one with jieba, remove stop words, and build a sentence-by-word count matrix:

    def loadCorpusFromFile(self, fn, stopwords):
        # Chinese word segmentation: read the whole file and split it into sentences on "。"
        f = open(fn, 'r', encoding='utf-8')
        text1 = f.readlines()
        text1 = "".join(text1)
        text1 = text1.split("。")
        text = ""
        for itext in text1:
            text2 = jieba.cut_for_search(itext)
            for itext2 in text2:
                if itext2 not in stopwords:
                    text += itext2
            text += " "
        text = text[:-1]
        jieba.load_userdict('stopwords\\userdict.txt')
        seg_generator = jieba.cut(text)
        seg_list = [i for i in seg_generator if i not in self.stop_words]
        seg_list = r' '.join(seg_list)
        # Collect every distinct word into the vocabulary
        seglist = seg_list.split(" ")
        self.vocab = []
        for word in seglist:
            if (word != u' ' and word not in self.vocab):
                self.vocab.append(word)
        CountMatrix = []
        f.seek(0, 0)
        # Count the term frequencies of each document (one line = one document)
        for line in f:
            # zero the term-frequency vector for this line
            count = np.zeros(len(self.vocab), dtype=int)
            text = line.strip()
            # the line still has to be segmented first
            seg_generator = jieba.cut(text)
            seg_list = [i for i in seg_generator if i not in self.stop_words and len(i) > 1]
            seg_list = r' '.join(seg_list)
            seglist = seg_list.split(" ")
            # count how often each vocabulary word occurs
            for word in seglist:
                if word in self.vocab:
                    count[self.vocab.index(word)] += 1
            CountMatrix.append(count)
        f.close()
        # self.ppCountMatrix = (len(CountMatrix), len(self.vocab))
        self.ppCountMatrix = np.array(CountMatrix)
        print("load corpus from %s success!" % fn)

Run the iterative LDA training:

    def fitModel(self, n_iter=1500, _alpha=0.1, _eta=0.01):
        self.model = lda.LDA(n_topics=self.n_topic, n_iter=n_iter, alpha=_alpha, eta=_eta, random_state=1)
        self.model.fit(self.ppCountMatrix)
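A short usage sketch, assuming the corpus has already been loaded as above: after fitting, the lda model exposes the per-sentence topic distribution and the per-topic word distribution.

model.fitModel(n_iter=1500, _alpha=0.1, _eta=0.01)
print(model.model.doc_topic_.shape)   # (number of sentences, n_topic)
print(model.model.topic_word_.shape)  # (n_topic, vocabulary size)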

Work out which topic each sentence belongs to:

    def no_printDoc_Topic(self):
        # Count, for every topic, how many sentences have it as their dominant topic
        values = []
        for i in range(0, self.n_topic):
            values.append(0)
        for i in range(len(self.ppCountMatrix)):
            values[self.model.doc_topic_[i].argmax()] += 1
        return values

    def no_printDoc_Topic_juzi(self, juzikey):
        # Mark with 1 every sentence whose dominant topic is juzikey
        values = []
        for i in range(len(self.ppCountMatrix)):
            values.append(0)
            if self.model.doc_topic_[i].argmax() == juzikey:
                values[i] = 1
        return values
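Taken together, the two methods pick out the dominant topic and flag its sentences; a minimal sketch, assuming the fitted model from above:

values = model.no_printDoc_Topic()       # how many sentences fall under each topic
top_topic = values.index(max(values))    # the topic covering the most sentences
flags = model.no_printDoc_Topic_juzi(top_topic)
# flags[i] == 1 means sentence i belongs to the dominant topic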

Collect all sentences of the chosen topic, then segment them, remove stop words, run part-of-speech tagging, and output the nouns:

# Stop words used when extracting the summary sentences
def load_stopwordslist(path):
    stoplist = [line.strip() for line in codecs.open(path, 'r', encoding='utf8').readlines()]
    return stoplist


# Rewrite zhengwen.txt as zhengwen2.txt, one sentence per line
def gengxinzhengwen(path):
    f = open(path, 'r', encoding='utf-8')
    text2 = ""
    text1 = f.readlines()
    f.close()
    for i in text1:
        text2 += str(i).replace('\n', '')
    text2 = text2.split("。")
    with open('cut_imgs_txt\\zhengwen2.txt', 'w', encoding='utf-8') as f1:
        for itext in text2:
            f1.write(itext)
            f1.write('\n')

# Rewrite zhengwen2.txt as zhengwen3.txt: segment every line and drop stop words
def cut_gengxinzhengwen(path, stop):
    f = open(path, 'r', encoding='utf-8')
    text1 = f.readlines()
    f.close()
    with open('cut_imgs_txt\\zhengwen3.txt', 'w', encoding='utf-8') as f1:
        for itext in text1:
            text3 = ""
            text2 = jieba.cut(itext, cut_all=False)
            for i3 in text2:
                if i3 not in stop:
                    text3 += i3
            f1.write(text3)

# Write the sentences that belong to the chosen topic into zhengwen4.txt
def duiyingtopdejuzijihe(path, values):
    f = open(path, 'r', encoding='utf-8')
    text1 = f.readlines()
    f.close()
    i = 0
    with open('cut_imgs_txt\\zhengwen4.txt', 'w', encoding='utf-8') as f1:
        for itext in text1:
            if values[i] != 0:
                f1.write(itext)
            i += 1
# Print the key nouns
def mingcishuchu(path, stop):
    f = open(path, 'r', encoding='utf-8')
    text1 = f.readlines()
    f.close()
    value = ""
    for text in text1:
        value += str(text)
    all_key_values = []
    # Use jieba's part-of-speech tagging to pick out the nouns (flag 'n')
    jieba.load_userdict('stopwords\\userdict.txt')
    words = pseg.lcut(value)
    stayed_line = ""
    for word in words:
        if word.word not in stop and word.flag == 'n':
            if word.word not in all_key_values:
                all_key_values.append(word.word)
    for i in range(0, len(all_key_values)):
        stayed_line += str(all_key_values[i]) + " "
    print(str(stayed_line))
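A rough sketch of how these helpers chain together into the full extraction pipeline. The zhengwen*.txt names follow the paths used above, but the original article path, the stop-word file, and exactly which intermediate file feeds the LDA loader are assumptions:

stop = load_stopwordslist('stopwords\\stopwords.txt')     # placeholder stop-word file
gengxinzhengwen('cut_imgs_txt\\zhengwen.txt')             # original article -> one sentence per line (zhengwen2.txt)
cut_gengxinzhengwen('cut_imgs_txt\\zhengwen2.txt', stop)  # segmented, stop words removed -> zhengwen3.txt

model = LDA_v20161130(topics=10)
model.loadCorpusFromFile('cut_imgs_txt\\zhengwen3.txt', stop)
model.fitModel()

values = model.no_printDoc_Topic()
flags = model.no_printDoc_Topic_juzi(values.index(max(values)))
duiyingtopdejuzijihe('cut_imgs_txt\\zhengwen2.txt', flags)  # top-topic sentences -> zhengwen4.txt
mingcishuchu('cut_imgs_txt\\zhengwen4.txt', stop)           # print the key nouns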
 

Core code:

import codecs

import jieba
import lda
import numpy as np


class LDA_v20161130():

    def __init__(self, topics=2):
        self.n_topic = topics
        self.corpus = None
        self.vocab = None
        self.ppCountMatrix = None
        self.stop_words = [u',', u'。', u'、', u'(', u')', u'·', u'!', u' ', u':', u'“', u'”', u'\n']
        self.model = None

    def loadCorpusFromFile(self, fn, stopwords):
        # Chinese word segmentation: read the whole file and split it into sentences on "。"
        f = open(fn, 'r', encoding='utf-8')
        text1 = f.readlines()
        text1 = "".join(text1)
        text1 = text1.split("。")
        text = ""
        for itext in text1:
            text2 = jieba.cut_for_search(itext)
            for itext2 in text2:
                if itext2 not in stopwords:
                    text += itext2
            text += " "
        text = text[:-1]
        jieba.load_userdict('stopwords\\userdict.txt')
        seg_generator = jieba.cut(text)
        seg_list = [i for i in seg_generator if i not in self.stop_words]
        seg_list = r' '.join(seg_list)
        # Collect every distinct word into the vocabulary
        seglist = seg_list.split(" ")
        self.vocab = []
        for word in seglist:
            if (word != u' ' and word not in self.vocab):
                self.vocab.append(word)
        CountMatrix = []
        f.seek(0, 0)
        # Count the term frequencies of each document (one line = one document)
        for line in f:
            # zero the term-frequency vector for this line
            count = np.zeros(len(self.vocab), dtype=int)
            text = line.strip()
            # the line still has to be segmented first
            seg_generator = jieba.cut(text)
            seg_list = [i for i in seg_generator if i not in self.stop_words and len(i) > 1]
            seg_list = r' '.join(seg_list)
            seglist = seg_list.split(" ")
            # count how often each vocabulary word occurs
            for word in seglist:
                if word in self.vocab:
                    count[self.vocab.index(word)] += 1
            CountMatrix.append(count)
        f.close()
        # self.ppCountMatrix = (len(CountMatrix), len(self.vocab))
        self.ppCountMatrix = np.array(CountMatrix)
        print("load corpus from %s success!" % fn)

    def setStopWords(self, word_list):
        self.stop_words = word_list

    def fitModel(self, n_iter=1500, _alpha=0.1, _eta=0.01):
        self.model = lda.LDA(n_topics=self.n_topic, n_iter=n_iter, alpha=_alpha, eta=_eta, random_state=1)
        self.model.fit(self.ppCountMatrix)

    def printTopic_Word(self, n_top_word):
        # Print the highest-weighted words of the topic that covers the most sentences
        values = self.no_printDoc_Topic()
        for i, topic_dist in enumerate(self.model.topic_word_):
            if i == values.index(max(values)):
                if n_top_word != 0:
                    topic_words = np.array(self.vocab)[np.argsort(topic_dist)][:-(n_top_word + 1):-1]
                else:
                    # n_top_word == 0: print the whole vocabulary in descending weight order
                    topic_words = np.array(self.vocab)[np.argsort(topic_dist)][::-1]
                print("Topic:", i, "\t")
                for word in topic_words:
                    print(word, end=" ")
        print('\n')

    def no_printDoc_Topic(self):
        # Count, for every topic, how many sentences have it as their dominant topic
        values = []
        for i in range(0, self.n_topic):
            values.append(0)
        for i in range(len(self.ppCountMatrix)):
            values[self.model.doc_topic_[i].argmax()] += 1
        return values

    def no_printDoc_Topic_juzi(self, juzikey):
        # Mark with 1 every sentence whose dominant topic is juzikey
        values = []
        for i in range(len(self.ppCountMatrix)):
            values.append(0)
            if self.model.doc_topic_[i].argmax() == juzikey:
                values[i] = 1
        return values

    def printVocabulary(self):
        print("vocabulary:")
        for word in self.vocab:
            print(word)
        print('\n')

    def saveVocabulary(self, fn):
        f = codecs.open(fn, 'w', 'utf-8')
        for word in self.vocab:
            f.write("%s\n" % word)
        f.close()

    def saveTopic_Words(self, fn, n_top_word=-1):
        if n_top_word == -1:
            n_top_word = len(self.vocab)
        f = codecs.open(fn, 'w', 'utf-8')
        for i, topic_dist in enumerate(self.model.topic_word_):
            topic_words = np.array(self.vocab)[np.argsort(topic_dist)][:-(n_top_word + 1):-1]
            f.write("Topic:%d\t" % i)
            for word in topic_words:
                f.write("%s " % word)
            f.write("\n")
        f.close()

    def saveDoc_Topic(self, fn):
        f = codecs.open(fn, 'w', 'utf-8')
        for i in range(len(self.ppCountMatrix)):
            f.write("Doc %d:((top topic:%s) topic distribution:%s)\n" % (
            i, self.model.doc_topic_[i].argmax(), self.model.doc_topic_[i]))
        f.close()
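Once training has finished, the remaining helpers can persist the vocabulary and the topic assignments for inspection; the output file names below are placeholders:

model.printTopic_Word(10)                           # top 10 words of the dominant topic
model.saveVocabulary('out\\vocab.txt')              # one vocabulary word per line
model.saveTopic_Words('out\\topic_words.txt', 20)   # top 20 words of every topic
model.saveDoc_Topic('out\\doc_topic.txt')           # dominant topic and full distribution per sentence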

Run results:

(screenshot of the program output)
