Theory: http://www.matrix67.com/blog/archives/5044
Code reference: https://github.com/yanghanxy/New-Word-Detection
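In brief (summarizing the linked article and the code below): every substring of the corpus up to max_word_len characters is treated as a candidate word and scored on three signals. Frequency is its count divided by the corpus length. Internal cohesion is the pointwise mutual information, taken as the minimum of log(p(w) / (p(left) * p(right))) over every way of splitting the candidate in two. Boundary freedom is the entropy -Σ p·log p of the characters seen immediately to the left and to the right of the candidate, keeping the smaller of the two. A candidate is reported as a new word only if all three scores exceed their thresholds (min_tf, min_pmi, min_entropy).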
import codecs
import re
import math

import pandas as pd


class WordInfo(object):
    """Statistics for one candidate word: count, frequency, PMI and left/right entropy."""

    def __init__(self, text):
        self.text = text
        self.left_entropy = 0.0
        self.right_entropy = 0.0
        self.freq = 0.0
        self.pmi = 0.0
        self.num = 0
        self.left_word = []
        self.right_word = []

    def lupdate(self, word):
        # record one left-neighbor occurrence
        self.left_word.append(word)

    def rupdate(self, word):
        # record one right-neighbor occurrence
        self.right_word.append(word)

    def update(self, word_cad):
        # PMI only makes sense for words of two or more characters
        if len(self.text) > 1:
            self.compute_pmi(word_cad)
        self.compute_indexes()

    def compute_freq(self, length):
        self.freq = 1.0 * self.num / length

    def compute_pmi(self, word_cad):
        # split the word at every position and keep the weakest (minimum) PMI
        sub_part = [(self.text[0:i], self.text[i:]) for i in range(1, len(self.text))]
        if len(sub_part) > 0:
            self.pmi = min(
                map(lambda word: math.log(self.freq / word_cad[word[0]].freq / word_cad[word[1]].freq), sub_part))
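    # Worked example (illustration only, not part of the original script): a two-character
    # candidate with freq 0.001 whose two halves each have freq 0.01 gets
    # pmi = log(0.001 / (0.01 * 0.01)) = log(10) ≈ 2.30.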
    def compute_entropy(self, _list):
        # Shannon entropy of the neighbor distribution; 0 when there are no neighbors
        length = float(len(_list))
        frequence = {}
        if length == 0:
            return 0
        for i in _list:
            frequence[i] = frequence.get(i, 0) + 1
        return sum(map(lambda x: -x / length * math.log(x / length), frequence.values()))
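    # Worked example (illustration only, not part of the original script): for the
    # neighbor list ['他', '他', '的', '了'] the probabilities are 0.5, 0.25, 0.25, so the
    # entropy is -(0.5*log 0.5 + 2 * 0.25*log 0.25) ≈ 1.04; a single repeated neighbor gives 0.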
    def compute_indexes(self):
        # compute left/right neighbor entropy
        self.left_entropy = self.compute_entropy(self.left_word)
        self.right_entropy = self.compute_entropy(self.right_word)


class Seg(object):
    def __init__(self, doc, max_word_len=5, min_tf=0.000005, min_entropy=0.07, min_pmi=6.0):
        super(Seg, self).__init__()
        self.doc = doc
        self.max_word_len = max_word_len
        self.min_entropy = min_entropy
        self.min_tf = min_tf
        self.min_pmi = min_pmi
        self.word_info = self.get_words(doc)
        count = float(len(self.word_info))
        # corpus-level averages, useful when choosing the thresholds
        self.avg_frq = sum(map(lambda w: w.freq, self.word_info)) / count
        self.avg_entropy = sum(map(lambda w: min(w.left_entropy, w.right_entropy), self.word_info)) / count
        self.avg_pmi = sum(map(lambda w: w.pmi, self.word_info)) / count
        # keep only multi-character candidates that clear all three thresholds
        filter_function = lambda f: len(f.text) > 1 and f.pmi > self.min_pmi and f.freq > self.min_tf \
                                    and min(f.left_entropy, f.right_entropy) > self.min_entropy
        # materialize as a list so the result can be iterated more than once (Python 3)
        self.word_tf_pmi_ent = list(map(lambda w: (w.text, len(w.text), w.freq, w.pmi, min(w.left_entropy, w.right_entropy)),
                                        filter(filter_function, self.word_info)))
    def extract_candidate_word(self, _doc, _max_word_len):
        # enumerate every substring of length 1.._max_word_len as a candidate,
        # sorted so that identical substrings end up adjacent
        indexes = []
        doc_length = len(_doc)
        for i in range(doc_length):
            for j in range(i + 1, min(i + 1 + _max_word_len, doc_length + 1)):
                indexes.append((i, j))
        return sorted(indexes, key=lambda _word: _doc[_word[0]:_word[1]])
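    # Worked example (illustration only, not part of the original script): for
    # _doc = '孙悟空' and _max_word_len = 2 the call returns
    # [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3)], i.e. the substrings
    # '孙', '孙悟', '悟', '悟空', '空' in sorted order.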
    def get_words(self, doc):
        # strip whitespace, digits, ASCII letters and punctuation before counting
        pattern = re.compile(u'[\\s\\d,.<>/?:;\'\"[\\]{}()\\|~!@#$%^&*\\-_=+a-zA-Z,。《》、?:;“”‘’{}【】()…¥!—┄-]+')
        doc = pattern.sub(r'', doc)
        print('cleaned doc length: %d' % len(doc))
        word_index = self.extract_candidate_word(doc, self.max_word_len)
        word_cad = {}
        print('candidate occurrences: %d' % len(word_index))
        for index in word_index:
            word = doc[index[0]:index[1]]
            if word not in word_cad:
                word_cad[word] = WordInfo(word)
            word_cad[word].num += 1
            # neighbor slices are empty strings at the document boundaries
            word_cad[word].lupdate(doc[index[0] - 1:index[0]])
            word_cad[word].rupdate(doc[index[1]:index[1] + 1])
        length = len(doc)
        # compute the frequency of every candidate word
        for word in word_cad:
            word_cad[word].compute_freq(length)
        print('frequencies computed')
        # process shorter words first so compute_pmi can look up the parts' frequencies
        values = sorted(word_cad.values(), key=lambda x: len(x.text))
        print('distinct candidates: %d' % len(values))
        for v in values:
            v.update(word_cad)
        print('pmi and entropy computed')
        # return candidates ordered by word length
        return sorted(values, key=lambda v: len(v.text), reverse=False)
if __name__ == '__main__':
    path = 'xiyouji.txt'
    doc = codecs.open(path, "r", "utf-8").read()
    word = Seg(doc, max_word_len=3, min_tf=1e-08, min_entropy=1.0, min_pmi=3.0)
    print('avg_frq:' + str(word.avg_frq))
    print('avg_pmi:' + str(word.avg_pmi))
    print('avg_entropy:' + str(word.avg_entropy))
    wordlist = []
    for i in word.word_tf_pmi_ent:
        wordlist.append([i[0], i[1], i[2], i[3], i[4]])
    # sort by PMI, then by boundary entropy (the second sort determines the final order)
    wordlist = sorted(wordlist, key=lambda w: w[3], reverse=True)
    wordlist = sorted(wordlist, key=lambda w: w[4], reverse=True)
    seg = pd.DataFrame(wordlist, columns=['word', 'length', 'freq', 'pmi', 'entropy'])
    seg.to_csv('extractword2.csv', index=False, encoding="utf-8")
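
A quick way to eyeball the output (a minimal sketch, assuming the script above has already written extractword2.csv to the working directory):

import pandas as pd

# load the extracted candidates and print the twenty with the highest boundary entropy
result = pd.read_csv('extractword2.csv', encoding='utf-8')
print(result.sort_values(by=['entropy', 'pmi'], ascending=False).head(20))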