import re
def extract_ngram(all_sentences, min_feq=0, max_n=5):
    """Count character n-grams (1..max_n) over a corpus of tokenized sentences.

    Each sentence in ``all_sentences`` is a list of single characters, e.g.
    ``[['迈','向','充','满','希','望','的','新','世','纪',...], ...]``.
    Every sentence is joined back into a string and split on any character
    that is not a CJK ideograph, digit, or ASCII letter, so punctuation acts
    as an n-gram boundary (n-grams never span punctuation).

    Parameters:
        all_sentences: iterable of character lists (one list per sentence).
        min_feq: only n-grams with count strictly greater than this
            threshold are kept (0 keeps everything that occurred at least once).
        max_n: maximum n-gram length to extract (default 5, matching the
            original hard-coded behavior).

    Returns:
        dict mapping n-gram string -> occurrence count.
    """
    # Hoist the delimiter pattern out of the loop: anything that is NOT a
    # CJK character (U+4E00..U+9FA5), digit, or ASCII letter is a boundary.
    boundary = re.compile(u'[^\u4e00-\u9fa50-9a-zA-Z]')

    # Flatten the 2-D character lists into a 1-D list of clean segments,
    # dropping the empty strings re.split produces between adjacent delimiters.
    segments = []
    for sen in all_sentences:
        for seg in boundary.split(''.join(sen)):
            if seg:
                segments.append(seg)

    n_gram_dict = {}
    for sentence in segments:
        length = len(sentence)
        for i in range(length):
            for n in range(1, max_n + 1):
                if i + n > length:
                    break  # remaining windows starting at i would overrun too
                # sentence is a str, so the slice IS the n-gram — no join needed.
                n_gram = sentence[i:i + n]
                n_gram_dict[n_gram] = n_gram_dict.get(n_gram, 0) + 1

    # Keep only n-grams strictly more frequent than the threshold.
    return {gram: c for gram, c in n_gram_dict.items() if c > min_feq}
# NOTE(review): the two tokens below are copy-paste residue from a web page
# (a date and a view counter), not code; left commented out so the file parses.
# 05-15
# 1706