Theory: http://www.matrix67.com/blog/archives/5044
Code reference: https://github.com/yanghanxy/New-Word-Detection
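In brief (summarizing the linked article and the code below): every substring of the corpus up to max_word_len characters is treated as a candidate word and scored on three signals. Frequency is its count divided by the corpus length. Internal cohesion is the pointwise mutual information, taken as the minimum of log(p(w) / (p(left) * p(right))) over every way of splitting the candidate in two. Boundary freedom is the entropy -Σ p·log p of the characters seen immediately to the left and to the right of the candidate, keeping the smaller of the two. A candidate is reported as a new word only if all three scores exceed their thresholds (min_tf, min_pmi, min_entropy).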
import codecs
import re
import math

import pandas as pd


class WordInfo(object):
    """Statistics for one candidate word: count, frequency, PMI and left/right entropy."""

    def __init__(self, text):
        self.text = text
        self.left_entropy = 0.0
        self.right_entropy = 0.0
        self.freq = 0.0
        self.pmi = 0.0
        self.num = 0
        self.left_word = []
        self.right_word = []

    def lupdate(self, word):
        # record one left-neighbor occurrence
        self.left_word.append(word)

    def rupdate(self, word):
        # record one right-neighbor occurrence
        self.right_word.append(word)

    def update(self, word_cad):
        # PMI only makes sense for words of two or more characters
        if len(self.text) > 1:
            self.compute_pmi(word_cad)
        self.compute_indexes()

    def compute_freq(self, length):
        self.freq = 1.0 * self.num / length

    def compute_pmi(self, word_cad):
        # split the word at every position and keep the weakest (minimum) PMI
        sub_part = [(self.text[0:i], self.text[i:]) for i in range(1, len(self.text))]
        if len(sub_part) > 0:
            self.pmi = min(
                map(lambda word: math.log(self.freq / word_cad[word[0]].freq / word_cad[word[1]].freq), sub_part))
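    # Worked example (illustration only, not part of the original script): a two-character
    # candidate with freq 0.001 whose two halves each have freq 0.01 gets
    # pmi = log(0.001 / (0.01 * 0.01)) = log(10) ≈ 2.30.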
    def compute_entropy(self, _list):
        # Shannon entropy of the neighbor distribution; 0 when there are no neighbors
        length = float(len(_list))
        frequence = {}
        if length == 0:
            return 0
        for i in _list:
            frequence[i] = frequence.get(i, 0) + 1
        return sum(map(lambda x: -x / length * math.log(x / length), frequence.values()))
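    # Worked example (illustration only, not part of the original script): for the
    # neighbor list ['他', '他', '的', '了'] the probabilities are 0.5, 0.25, 0.25, so the
    # entropy is -(0.5*log 0.5 + 2 * 0.25*log 0.25) ≈ 1.04; a single repeated neighbor gives 0.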
    def compute_indexes(self):
        # compute left/right neighbor entropy
        self.left_entropy = self.compute_entropy(self.left_word)
        self.right_entropy = self.compute_entropy(self.right_word)


class Seg(object):
    def __init__(self, doc, max_word_len=5, min_tf=0.000005, min_entropy=0.07, min_pmi=6.0):
        super(Seg, self).__init__()
        self.doc = doc
        self.max_word_len = max_word_len
        self.min_entropy = min_entropy
        self.min_tf = min_tf
        self.min_pmi = min_pmi
        self.word_info = self.get_words(doc)
        count = float(len(self.word_info))
        # corpus-level averages, useful when choosing the thresholds
        self.avg_frq = sum(map(lambda w: w.freq, self.word_info)) / count
        self.avg_entropy = sum(map(lambda w: min(w.left_entropy, w.right_entropy), self.word_info)) / count
        self.avg_pmi = sum(map(lambda w: w.pmi, self.word_info)) / count
        # keep only multi-character candidates that clear all three thresholds
        filter_function = lambda f: len(f.text) > 1 and f.pmi > self.min_pmi and f.freq > self.min_tf \
                                    and min(f.left_entropy, f.right_entropy) > self.min_entropy
        # materialize as a list so the result can be iterated more than once (Python 3)
        self.word_tf_pmi_ent = list(map(lambda w: (w.text, len(w.text), w.freq, w.pmi, min(w.left_entropy, w.right_entropy)),
                                        filter(filter_function, self.word_info)))
    def extract_candidate_word(self, _doc, _max_word_len):
        # enumerate every substring of length 1.._max_word_len as a candidate,
        # sorted so that identical substrings end up adjacent
        indexes = []
        doc_length = len(_doc)
        for i in range(doc_length):
            for j in range(i + 1, min(i + 1 + _max_word_len, doc_length + 1)):
                indexes.append((i, j))
        return sorted(indexes, key=lambda _word: _doc[_word[0]:_word[1]])
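    # Worked example (illustration only, not part of the original script): for
    # _doc = '孙悟空' and _max_word_len = 2 the call returns
    # [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3)], i.e. the substrings
    # '孙', '孙悟', '悟', '悟空', '空' in sorted order.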
    def get_words(self, doc):
        # strip whitespace, digits, ASCII letters and punctuation before counting
        pattern = re.compile(u'[\\s\\d,.<>/?:;\'\"[\\]{}()\\|~!@#$%^&*\\-_=+a-zA-Z,。《》、?:;“”‘’{}【】()…¥!—┄-]+')
        doc = pattern.sub(r'', doc)
        print('cleaned doc length: %d' % len(doc))
        word_index = self.extract_candidate_word(doc, self.max_word_len)
        word_cad = {}
        print('candidate occurrences: %d' % len(word_index))
        for index in word_index:
            word = doc[index[0]:index[1]]
            if word not in word_cad:
                word_cad[word] = WordInfo(word)
            word_cad[word].num += 1
            # neighbor slices are empty strings at the document boundaries
            word_cad[word].lupdate(doc[index[0] - 1:index[0]])
            word_cad[word].rupdate(doc[index[1]:index[1] + 1])
        length = len(doc)
        # compute the frequency of every candidate word
        for word in word_cad:
            word_cad[word].compute_freq(length)
        print('frequencies computed')
        # process shorter words first so compute_pmi can look up the parts' frequencies
        values = sorted(word_cad.values(), key=lambda x: len(x.text))
        print('distinct candidates: %d' % len(values))
        for v in values:
            v.update(word_cad)
        print('pmi and entropy computed')
        # return candidates ordered by word length
        return sorted(values, key=lambda v: len(v.text), reverse=False)
if __name__ == '__main__':
    path = 'xiyouji.txt'
    doc = codecs.open(path, "r", "utf-8").read()
    word = Seg(doc, max_word_len=3, min_tf=1e-08, min_entropy=1.0, min_pmi=3.0)
    print('avg_frq:' + str(word.avg_frq))
    print('avg_pmi:' + str(word.avg_pmi))
    print('avg_entropy:' + str(word.avg_entropy))
    wordlist = []
    for i in word.word_tf_pmi_ent:
        wordlist.append([i[0], i[1], i[2], i[3], i[4]])
    # sort by PMI, then by boundary entropy (the second sort determines the final order)
    wordlist = sorted(wordlist, key=lambda w: w[3], reverse=True)
    wordlist = sorted(wordlist, key=lambda w: w[4], reverse=True)
    seg = pd.DataFrame(wordlist, columns=['word', 'length', 'freq', 'pmi', 'entropy'])
    seg.to_csv('extractword2.csv', index=False, encoding="utf-8")
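
A quick way to eyeball the output (a minimal sketch, assuming the script above has already written extractword2.csv to the working directory):

import pandas as pd

# load the extracted candidates and print the twenty with the highest boundary entropy
result = pd.read_csv('extractword2.csv', encoding='utf-8')
print(result.sort_values(by=['entropy', 'pmi'], ascending=False).head(20))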