def read_file(file):
    """Read a text file and return its content normalised for word counting.

    Non-ASCII characters (Chinese text, full-width punctuation, ...) are
    dropped, the remaining text is lower-cased, and every ASCII punctuation
    character or symbol is replaced by a single space.  Whitespace
    (including newlines) and digits are left untouched.

    Args:
        file: Path of a UTF-8 encoded text file.

    Returns:
        The normalised text as one string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    import string  # function-scope import keeps this block self-contained

    with open(file, 'r', encoding='utf-8') as data:
        raw_text = data.read()

    # Filter before lower-casing: a few non-ASCII characters (e.g. the Kelvin
    # sign) lower-case to ASCII letters and must not leak into the result.
    ascii_text = ''.join(ch for ch in raw_text if ch.isascii()).lower()

    # string.punctuation lists every ASCII punctuation character, including
    # the backtick; one translate() pass replaces them all with spaces.
    to_space = str.maketrans(dict.fromkeys(string.punctuation, ' '))
    return ascii_text.translate(to_space)
def count_of_words(txt):
    """Count the words in a text whose punctuation was already removed.

    A word is a maximal run of non-whitespace characters, so spaces, tabs
    and newlines all act as separators and never end up inside a word.
    Purely numeric tokens are counted as words.

    Args:
        txt: Text in which punctuation and symbols have been replaced by
            spaces.

    Returns:
        A tuple ``(total, unique)``: the number of words and the number of
        distinct words.  Empty or whitespace-only text yields ``(0, 0)``.
    """
    # str.split() with no argument splits on any whitespace run and never
    # produces empty strings, so no clean-up pass is needed.
    words = txt.split()
    return (len(words), len(set(words)))
def word_frequency(txt):
    """Count how often each word occurs in a punctuation-free text.

    A word is a maximal run of non-whitespace characters.  Tokens made up
    only of digits are ignored.

    Args:
        txt: Text in which punctuation and symbols have been replaced by
            spaces.

    Returns:
        A dict mapping each word to its number of occurrences.  Keys appear
        in order of first occurrence in ``txt``.
    """
    import collections

    # split() handles every kind of whitespace and yields no empty tokens;
    # the generator drops purely numeric tokens before counting.
    words = (word for word in txt.split() if not word.isdigit())
    return dict(collections.Counter(words))
def top_ten_words(frequency, cnt):
    """Print the ``cnt`` most frequent words, one ``word count`` per line.

    Words are listed by descending count; words with equal counts keep the
    order they have in ``frequency``.  The dictionary passed in is not
    modified.

    Args:
        frequency: Dict mapping each word to its number of occurrences.
        cnt: Maximum number of entries to print.  Zero or a negative value
            prints nothing.
    """
    # sorted() is stable, so ties stay in the dictionary's own order.
    ranked = sorted(frequency.items(), key=lambda pair: pair[1], reverse=True)
    # Clamp cnt so a negative value cannot turn into a "drop the tail" slice.
    for word, times in ranked[:max(cnt, 0)]:
        print(f'{word} {times}')
def top_ten_words_no_excludes(frequency, cnt):
    """Print the ``cnt`` most frequent words, skipping common function words.

    Articles, pronouns, linking verbs, conjunctions and similar very common
    words (the ``excludes_words`` set below) are left out; the remaining
    words are printed one ``word count`` per line by descending count.
    Words with equal counts keep the order they have in ``frequency``.
    The dictionary passed in is not modified.

    Args:
        frequency: Dict mapping each word to its number of occurrences.
        cnt: Maximum number of entries to print.  Zero or a negative value
            prints nothing.
    """
    excludes_words = frozenset([
        'a', 'an', 'the', 'i', 'he', 'she', 'his', 'my', 'we',
        'or', 'is', 'was', 'do', 'and', 'at', 'to', 'of', 'it', 'on', 'that', 'her',
        'c', 'in', 'you', 'had', 's', 'with', 'for', 't', 'but', 'as', 'not', 'they',
        'be', 'were', 'so', 'our', 'all', 'would', 'if', 'him', 'from', 'no', 'me',
        'could', 'when', 'there', 'them', 'about', 'this', 'their', 'up', 'been',
        'by', 'out', 'did', 'have',
    ])
    # Filter into a new sequence instead of deleting keys, so the caller's
    # frequency table stays intact for later use.
    kept = (pair for pair in frequency.items() if pair[0] not in excludes_words)
    # sorted() is stable, so ties stay in the dictionary's own order.
    ranked = sorted(kept, key=lambda pair: pair[1], reverse=True)
    # Clamp cnt so a negative value cannot turn into a "drop the tail" slice.
    for word, times in ranked[:max(cnt, 0)]:
        print(f'{word} {times}')
if __name__ == '__main__':
    # The text is loaded and its frequency table built before any input is
    # read; the first input line then selects what to report.
    filename = 'Who Moved My Cheese.txt'  # input text file
    content = read_file(filename)  # normalised text as a single string
    frequency_result = word_frequency(content)  # word -> occurrence count
    cmd = input()
    if cmd == '2':
        # Command 2: total and distinct word counts.
        amount_results = count_of_words(content)
        print('文章共有单词{}个,其中不重复单词{}个'.format(*amount_results))
    elif cmd in ('1', '3', '4'):
        # Commands 1, 3 and 4 each take a second input line holding a number.
        n = int(input())
        if cmd == '1':
            # First n characters of the normalised text.
            print(content[:n])
        elif cmd == '3':
            # The n most frequent words.
            top_ten_words(frequency_result, n)
        else:
            # The n most frequent words, common function words excluded.
            top_ten_words_no_excludes(frequency_result, n)
# Article title: word-frequency statistics in Python (词频统计python)
# Page footer: latest recommended article published 2024-04-30 19:24:04