'''
引用自:涛笙依旧_,仅限学习交流用,如有侵权请告知。
'''
# Default location of the HIT stopword list (UTF-8, one word per line).
# NOTE(review): the driver code at the bottom of the file passes a literal
# "hit_stopwords.txt" instead of this constant — confirm which path is intended.
stop_word_path = r'E:\论文\算法\停用词表\hit_stopwords.txt'
def get_stop_word(stop_word_path):
    """Load a stopword list from a UTF-8 text file, one word per line.

    Args:
        stop_word_path: path to the stopword file.

    Returns:
        list[str]: the stop words, with trailing newlines removed.
    """
    # Fixes vs. original:
    #  - the file handle was never closed; `with` closes it deterministically.
    #  - `line[:-1]` chopped the last character of the final word whenever the
    #    file lacked a trailing newline; rstrip('\n') removes only newlines.
    with open(stop_word_path, encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]
def text_generator(file_path):
    """Yield (title, cleaned_text) for every ``*.txt`` file under a directory.

    The title is the file's first line.  The cleaned text is the whole file
    with ideographic spaces (U+3000) removed, stripped, and every character
    outside [CJK ideographs, digits, ASCII letters, space] deleted.

    Args:
        file_path: directory containing UTF-8 ``.txt`` files.

    Yields:
        tuple[str, str]: (title, cleaned_text) per file.
    """
    # Local imports keep this block self-contained; the visible file has no
    # top-level import section — TODO: move these to the module top.
    import glob
    import re

    for txt in glob.glob(f'{file_path}/*.txt'):
        # Builtin open replaces codecs.open: it closes the handle via the
        # with-block (the original leaked it) and normalises \r\n newlines.
        with open(txt, encoding='utf-8') as f:
            content = f.read()
        title = content.split("\n")[0]
        content = content.replace(u'\u3000', '').strip()
        yield title, re.sub(u'[^\u4e00-\u9fa50-9a-zA-Z ]+', '', content)
class NewWordFind():
    """Discover new words in text via n-gram counting, a PMI-style cohesion
    filter, and left/right neighbour-entropy filtering."""

    def __init__(self, n_gram=5, min_p=2, min_entropy=1, max_score=100, min_score=2):
        """
        Args:
            n_gram: int, maximum n-gram length to extract.
            min_p: int, minimum cohesion (PMI-style) score threshold.
            min_entropy: int, minimum left/right neighbour-entropy threshold.
            max_score: int, maximum combined-score threshold.
            min_score: int, minimum combined-score threshold.
        """
        self.n_gram = n_gram
        self.min_p = min_p
        self.min_entropy = min_entropy
        self.max_score = max_score
        self.min_score = min_score

    def n_gram_words(self, text):
        """Count every n-gram of length 1..self.n_gram occurring in `text`.

        Args:
            text: str, input sentence.

        Returns:
            dict[str, int]: n-gram -> frequency.
        """
        from collections import Counter  # TODO: move to module top

        words = []
        for size in range(1, self.n_gram + 1):
            words += [text[j:j + size] for j in range(len(text) - size + 1)]
        # Counter already yields the final table; the original copied it
        # key-by-key into a second identical dict, which is dropped here.
        return dict(Counter(words))

    def PMI_filter(self, word_freq_dic):
        """Keep multi-character n-grams whose cohesion score exceeds min_p.

        The score is min over internal split points of
        freq(prefix) * freq(suffix), divided by freq(word) — a count-based
        cohesion measure in the spirit of PMI.

        Args:
            word_freq_dic: dict, n-gram -> frequency.  Must contain every
                substring of every key (as produced by n_gram_words).

        Returns:
            dict[str, list[float]]: candidate word -> [cohesion score].
        """
        new_words_dic = {}
        for word, freq in word_freq_dic.items():
            if len(word) == 1:
                continue  # single characters have no internal split
            p_x_y = min(word_freq_dic[word[:i]] * word_freq_dic[word[i:]]
                        for i in range(1, len(word)))
            mpi = p_x_y / freq
            if mpi > self.min_p:
                new_words_dic[word] = [mpi]
        return new_words_dic

    def calculate_entropy(self, char_list):
        """Shannon entropy (base 2) of the characters in `char_list`.

        Args:
            char_list: list of single characters.

        Returns:
            float: the entropy; 0.0 for an empty list.
        """
        # math.log2 replaces np.log2 (numerically identical, stdlib-only).
        import math  # TODO: move to module top
        from collections import Counter  # TODO: move to module top

        total = len(char_list)
        if total == 0:
            # The original's empty sum also yielded 0; made explicit here.
            return 0.0
        freqs = Counter(char_list)
        return -sum((c / total) * math.log2(c / total) for c in freqs.values())

    def Entropy_left_right_filter(self, condinate_words_dic, text):
        """Filter candidates by neighbour entropy and combined-score range.

        A candidate survives when min(left, right) neighbour entropy exceeds
        min_entropy and score = cohesion - min(entropies) lies strictly in
        (min_score, max_score).

        Args:
            condinate_words_dic: dict, candidate word -> [cohesion score].
            text: str, the full text the candidates came from.

        Returns:
            list[dict]: surviving words with their pmi, entropies and score,
            sorted by score descending.
        """
        import re  # TODO: move to module top

        final_words_list = []
        for word, (pmi, *_rest) in condinate_words_dic.items():
            # re.escape guards against candidates containing regex
            # metacharacters (the original interpolated the raw word).
            left_right_char = re.findall('(.)%s(.)' % re.escape(word), text)
            left_entropy = self.calculate_entropy([pair[0] for pair in left_right_char])
            right_entropy = self.calculate_entropy([pair[1] for pair in left_right_char])
            min_ent = min(left_entropy, right_entropy)
            score = pmi - min_ent
            if min_ent > self.min_entropy and self.min_score < score < self.max_score:
                final_words_list.append({
                    "word": word,
                    "pmi": pmi,
                    "left_entropy": left_entropy,
                    "right_entropy": right_entropy,
                    "score": score,
                })
        return sorted(final_words_list, key=lambda x: x['score'], reverse=True)
def _remove_stop_words(text, stop_words):
    """Delete every stop word from `text` by plain substring removal.

    NOTE(review): substring replacement can also delete stop-word character
    sequences embedded inside longer words; kept as-is to preserve the
    original behaviour — confirm this is intended.
    """
    for stop_word in stop_words:
        text = text.replace(stop_word, "")
    return text


def main():
    """Run new-word discovery over every .txt file under `file_path`."""
    stop_words = get_stop_word(r"hit_stopwords.txt")
    file_path = r"file_path/"
    new_word_find = NewWordFind(n_gram=5, min_p=2, min_entropy=1,
                                max_score=100, min_score=2)
    for index, (title, text) in enumerate(text_generator(file_path)):
        print(f"\n index :{index} => title:{title}")
        text = _remove_stop_words(text, stop_words)
        # Renamed from `n_gram` in the original, which shadowed the
        # configuration value of the same name with the frequency dict.
        word_freq = new_word_find.n_gram_words(text)
        candidates = new_word_find.PMI_filter(word_freq)
        for new_words in new_word_find.Entropy_left_right_filter(candidates, text):
            print(f"{new_words}")


if __name__ == "__main__":
    main()