import re
def extract_ngram(all_sentences, min_feq=0, max_n=5):
    """Count character n-grams (1..max_n) over a corpus of tokenized sentences.

    Each sentence in ``all_sentences`` is a list of single characters, e.g.
    ``[['迈','向','充','满','希','望','的','新','世','纪',...], ...]``.
    Every sentence is joined back into a string and split on any character
    that is not a CJK ideograph, digit, or ASCII letter, so punctuation acts
    as an n-gram boundary (n-grams never span punctuation).

    Parameters:
        all_sentences: iterable of character lists (one list per sentence).
        min_feq: only n-grams with count strictly greater than this
            threshold are kept (0 keeps everything that occurred at least once).
        max_n: maximum n-gram length to extract (default 5, matching the
            original hard-coded behavior).

    Returns:
        dict mapping n-gram string -> occurrence count.
    """
    # Hoist the delimiter pattern out of the loop: anything that is NOT a
    # CJK character (U+4E00..U+9FA5), digit, or ASCII letter is a boundary.
    boundary = re.compile(u'[^\u4e00-\u9fa50-9a-zA-Z]')

    # Flatten the 2-D character lists into a 1-D list of clean segments,
    # dropping the empty strings re.split produces between adjacent delimiters.
    segments = []
    for sen in all_sentences:
        for seg in boundary.split(''.join(sen)):
            if seg:
                segments.append(seg)

    n_gram_dict = {}
    for sentence in segments:
        length = len(sentence)
        for i in range(length):
            for n in range(1, max_n + 1):
                if i + n > length:
                    break  # remaining windows starting at i would overrun too
                # sentence is a str, so the slice IS the n-gram — no join needed.
                n_gram = sentence[i:i + n]
                n_gram_dict[n_gram] = n_gram_dict.get(n_gram, 0) + 1

    # Keep only n-grams strictly more frequent than the threshold.
    return {gram: c for gram, c in n_gram_dict.items() if c > min_feq}
# NOTE(review): the two tokens below are copy-paste residue from a web page
# (a date and a view counter), not code; left commented out so the file parses.
# 05-15
# 1706