python文本词频统计

最新推荐文章于 2023-10-25 22:16:54 发布

布道天下

最新推荐文章于 2023-10-25 22:16:54 发布

阅读量1.1k

点赞数 2

文章标签： python

本文链接：https://blog.csdn.net/mn3321/article/details/114951086

版权

英文文本词频统计

以大仲马的《基督山伯爵》和莎士比亚的《哈姆雷特》英文版为例。

中文文本词频统计

以我吃西红柿的《吞噬星空》和罗贯中的《三国演义》中文版为例。

# testPython.py

# 引入jieba库
import jieba 

def getText(textName, type):
	"""Read a UTF-8 text file and return its content, normalized for counting.

	Args:
		textName: Path of the text file to read.
		type: Language tag. "en" lower-cases the text and replaces ASCII
			punctuation with spaces; any other value returns the text
			unchanged. (The name shadows the builtin but is kept so that
			existing callers keep working.)

	Returns:
		The file content as a single string.

	Raises:
		OSError: If the file cannot be opened or read.
		UnicodeDecodeError: If the file is not valid UTF-8.
	"""
	# The context manager closes the file even if reading fails.
	with open(textName, 'r', encoding='utf-8') as fp:
		text = fp.read()

	if "en" == type:
		punctuation = ',<>.?;:"[\\]/{}|=+-_()*&^%$#@!~`'
		# str.replace returns a new string, so its result must be kept;
		# translate() maps every punctuation character to a space in one pass.
		table = str.maketrans(punctuation, " " * len(punctuation))
		text = text.lower().translate(table)

	# No punctuation filtering is applied to non-English text here.
	return text

def getTextCount(text, type):
	"""Count word occurrences in ``text`` and return them by descending count.

	Args:
		text: The text to analyse.
		type: "en" splits the text on whitespace; any other value segments
			the text with ``jieba.lcut``.

	Returns:
		A list of ``(word, count)`` tuples sorted by count, highest first.
		Words with equal counts keep the order in which they first appeared.
	"""
	tally = {}

	if type != "en":
		# Prefix -> canonical name. Tokens starting with one of these
		# prefixes are merged into a single entry.
		aliases = (
			("诸葛亮", "孔明"),
			("孔明", "孔明"),
			("玄德", "刘备"),
			("丞相", "曹操"),
			("关公", "关羽"),
			("云长", "关羽"),
		)
		for token in jieba.lcut(text):
			# Single-character tokens (punctuation, particles) are skipped.
			if len(token) == 1:
				continue
			name = next(
				(canonical for prefix, canonical in aliases if token.startswith(prefix)),
				token,
			)
			tally[name] = tally.get(name, 0) + 1
	else:
		# Whitespace-separated tokens are the words.
		for token in text.split():
			tally[token] = tally.get(token, 0) + 1

	# Stable sort on the count, largest first.
	return sorted(tally.items(), key=lambda pair: pair[1], reverse=True)



def main():
	"""Print the most frequent words (at most 20) of the configured text.

	Reads "三国演义.txt" as Chinese text, counts its words and prints each
	word left-aligned in 10 columns followed by its count right-aligned in 5.
	"""
	# Alternative inputs; swap the active lines to analyse another text.
	# English:
	#text = getText("The Count of Monte Cristo.txt", "en")
	#text = getText("hamlet.txt", "en")
	# Chinese:
	#text = getText("吞噬星空.txt", "cn")
	text = getText("三国演义.txt", "cn")

	# Use "en" here when one of the English texts is selected above.
	#lsCnt = getTextCount(text, "en")
	lsCnt = getTextCount(text, "cn")

	# Slicing tolerates texts with fewer than 20 distinct words, where
	# indexing range(20) would raise IndexError.
	for word, cnt in lsCnt[:20]:
		print("{0:<10}{1:>5}".format(word, cnt))

# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
	main()