# (removed a stray "b" character that would raise NameError at import time)
# 语句字数分布统计-句子长度的分布和文体分析(风格、写作质量、结构等)
# 1 把文本分割成语句。
# 2 中文案例 政府工作报告 re + Counter - 柱状图
# 3 英文案例 就职演说 NLTK + Counter - 直方图
# 1 把文本分割成语句。
# 英文:(1)目录章节用换行表示结束;(2)Ms.中的缩写符号、(3)引号中有句号,并不表示结束
# 使用NLTK模块 NLTK sent_tokenize函数进行分割
# 中文:比英文含有更多的复杂情况,获取文本时,根据文本的写作方法调整程序。
# 比如括号中的句子,一些情况中独立成句,而一些情况则不。
# 正则表达式函数re.split(pattern,string),用于分割字符串。
# 2 中文案例 政府工作报告 re+Counter - 柱状图
import re
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
def process_files(file_paths):
    """Split the given UTF-8 text files into sentences and tally sentence lengths.

    Parameters
    ----------
    file_paths : list[str]
        Paths of the text files to read (concatenated before splitting).

    Returns
    -------
    tuple[list[int], list[int]]
        Sentence lengths and their frequencies, ordered by descending
        frequency (at most the 200 most common lengths).
    """
    # Concatenate every stripped line of every file into one string.
    pieces = []
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:  # iterate lazily; no need for readlines()
                pieces.append(line.strip())
    text = ''.join(pieces)
    # Remove spaces, then split on the Chinese full stop '。' — except when it
    # is immediately followed by a closing quote '」' (the sentence continues
    # to the quote there). Raw string keeps the regex escaping explicit.
    sentences = re.split(r'。(?!」)|\n', text.replace(' ', ''))
    # Drop empty fragments in one O(n) pass (the original repeated
    # list.remove(''), which is O(n^2)).
    sentences = [s for s in sentences if s != '']
    print(sentences)
    cnt = Counter(len(s) for s in sentences)
    # Counter.most_common sorts by count descending, same as the original
    # sorted(..., key=count, reverse=True)[:200].
    top_200 = cnt.most_common(200)
    print(top_200)
    print(len(top_200))
    lengths = [length for length, count in top_200]
    frequencies = [count for length, count in top_200]
    return lengths, frequencies
# ---- Chinese case: bar chart of sentence-length counts, 1954 vs 1992 ----
txt_files = ['/Users/zitongqiu/Documents/data mining/data/1954.txt',
             '/Users/zitongqiu/Documents/data mining/data/1992.txt']
colors = ['b', 'g']
legend_labels = ['1954政府工作报告','1992政府工作报告']
plt.figure(figsize=(16, 10))
# One semi-transparent bar series per report, so overlapping bars stay visible.
for report_path, bar_color in zip(txt_files, colors):
    lengths, frequencies = process_files([report_path])
    plt.bar(lengths, frequencies, color=bar_color, alpha=0.5)
plt.xlabel('语句字数',fontsize=12, color='navy')
plt.ylabel('频次',fontsize=12, color='navy')
plt.title('1954年和1992年政府工作报告的语句字数统计',fontsize=22, color='navy')
# NOTE(review): `lengths` here is whatever the *last* report produced, so the
# tick range is derived from that file only — confirm this is intended.
plt.xticks(np.arange(0, max(lengths)+1, 10))
plt.legend(legend_labels)
plt.show()
# 3 English case: inaugural addresses — NLTK sent_tokenize + histogram of
# words-per-sentence for three speeches.
import nltk
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import inaugural
nltk.download('inaugural')
speeches = ['1789-Washington.txt', '1961-Kennedy.txt', '2021-Biden.txt']
labels = ['1789华盛顿', '1961肯尼迪', '2021拜登']
colors = ['b', 'g', 'yellowgreen']
plt.figure(figsize=(15, 10))
max_words = 0  # longest sentence (in words) seen across all three speeches
for speech, label, color in zip(speeches, labels, colors):
    # Split the raw speech text into sentences, then count words per sentence.
    sents = nltk.tokenize.sent_tokenize(inaugural.raw(speech))
    nstring = np.array([len(sent.split()) for sent in sents])
    if nstring.size:
        max_words = max(max_words, int(nstring.max()))
    plt.hist(nstring,color=color,alpha=0.7,label=label,width=1,density=False)
plt.title('1789华盛顿/1961肯尼迪/2021拜登 就职演说语句单词数量分布', fontsize=20, color='navy')
plt.xlabel('句子中单词数量', fontsize=14, color='navy')
# BUG FIX: the original called np.arange(0, max(lengths)+1, 10) here, reusing
# `lengths` left over from the Chinese-report section above — the ticks had
# nothing to do with these speeches (and the line raised NameError when this
# section ran on its own). Use the actual maximum word count instead.
plt.xticks(np.arange(0, max_words + 1, 10))
plt.ylabel('频次', fontsize=14, color='navy')
plt.legend()
plt.show()