# (removed a stray "b" character that would raise NameError at import time)
# 语句字数分布统计-句子长度的分布和文体分析(风格、写作质量、结构等)
# 1 把文本分割成语句。
# 2 中文案例 政府工作报告 re + Counter - 柱状图
# 3 英文案例 就职演说 NLTK + Counter - 直方图
# 1 把文本分割成语句。
# 英文:(1)目录章节用换行表示结束;(2)Ms.中的缩写符号、(3)引号中有句号,并不表示结束
# 使用NLTK模块 NLTK sent_tokenize函数进行分割
# 中文:比英文含有更多的复杂情况,获取文本时,根据文本的写作方法调整程序。
# 比如括号中的句子,一些情况中独立成句,而一些情况则不。
# 正则表达式函数re.split(pattern,string),用于分割字符串。
# 2 中文案例 政府工作报告 re+Counter - 柱状图
import re
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
def process_files(file_paths):
    """Split the given UTF-8 text files into sentences and tally sentence lengths.

    Parameters
    ----------
    file_paths : list[str]
        Paths of the text files to read (concatenated before splitting).

    Returns
    -------
    tuple[list[int], list[int]]
        Sentence lengths and their frequencies, ordered by descending
        frequency (at most the 200 most common lengths).
    """
    # Concatenate every stripped line of every file into one string.
    pieces = []
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:  # iterate lazily; no need for readlines()
                pieces.append(line.strip())
    text = ''.join(pieces)
    # Remove spaces, then split on the Chinese full stop '。' — except when it
    # is immediately followed by a closing quote '」' (the sentence continues
    # to the quote there). Raw string keeps the regex escaping explicit.
    sentences = re.split(r'。(?!」)|\n', text.replace(' ', ''))
    # Drop empty fragments in one O(n) pass (the original repeated
    # list.remove(''), which is O(n^2)).
    sentences = [s for s in sentences if s != '']
    print(sentences)
    cnt = Counter(len(s) for s in sentences)
    # Counter.most_common sorts by count descending, same as the original
    # sorted(..., key=count, reverse=True)[:200].
    top_200 = cnt.most_common(200)
    print(top_200)
    print(len(top_200))
    lengths = [length for length, count in top_200]
    frequencies = [count for length, count in top_200]
    return lengths, frequencies
# ---- Chinese case: bar chart of sentence-length counts, 1954 vs 1992 ----
txt_files = ['/Users/zitongqiu/Documents/data mining/data/1954.txt',
             '/Users/zitongqiu/Documents/data mining/data/1992.txt']
colors = ['b', 'g']
legend_labels = ['1954政府工作报告','1992政府工作报告']
plt.figure(figsize=(16, 10))
# One semi-transparent bar series per report, so overlapping bars stay visible.
for report_path, bar_color in zip(txt_files, colors):
    lengths, frequencies = process_files([report_path])
    plt.bar(lengths, frequencies, color=bar_color, alpha=0.5)
plt.xlabel('语句字数',fontsize=12, color='navy')
plt.ylabel('频次',fontsize=12, color='navy')
plt.title('1954年和1992年政府工作报告的语句字数统计',fontsize=22, color='navy')
# NOTE(review): `lengths` here is whatever the *last* report produced, so the
# tick range is derived from that file only — confirm this is intended.
plt.xticks(np.arange(0, max(lengths)+1, 10))
plt.legend(legend_labels)
plt.show()
# 3 English case: inaugural addresses — NLTK sent_tokenize + histogram of
# words-per-sentence for three speeches.
import nltk
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import inaugural
nltk.download('inaugural')
speeches = ['1789-Washington.txt', '1961-Kennedy.txt', '2021-Biden.txt']
labels = ['1789华盛顿', '1961肯尼迪', '2021拜登']
colors = ['b', 'g', 'yellowgreen']
plt.figure(figsize=(15, 10))
max_words = 0  # longest sentence (in words) seen across all three speeches
for speech, label, color in zip(speeches, labels, colors):
    # Split the raw speech text into sentences, then count words per sentence.
    sents = nltk.tokenize.sent_tokenize(inaugural.raw(speech))
    nstring = np.array([len(sent.split()) for sent in sents])
    if nstring.size:
        max_words = max(max_words, int(nstring.max()))
    plt.hist(nstring,color=color,alpha=0.7,label=label,width=1,density=False)
plt.title('1789华盛顿/1961肯尼迪/2021拜登 就职演说语句单词数量分布', fontsize=20, color='navy')
plt.xlabel('句子中单词数量', fontsize=14, color='navy')
# BUG FIX: the original called np.arange(0, max(lengths)+1, 10) here, reusing
# `lengths` left over from the Chinese-report section above — the ticks had
# nothing to do with these speeches (and the line raised NameError when this
# section ran on its own). Use the actual maximum word count instead.
plt.xticks(np.arange(0, max_words + 1, 10))
plt.ylabel('频次', fontsize=14, color='navy')
plt.legend()
plt.show()