统计PDF文件的字符数、词语数和句子数

最新推荐文章于 2024-10-09 11:40:13 发布

爱弹琴的小蚂蚁

最新推荐文章于 2024-10-09 11:40:13 发布

阅读量398

点赞数 2

文章标签： python 开发语言

本文链接：https://blog.csdn.net/weixin_45051800/article/details/140964808

版权

#遍历文件夹下的所有PDF文件，统计每个文件的字符数、词语数和句子数，输出为Excel文件
import logging
import os
import re

import jieba
import PyPDF2
import thulac
from openpyxl import Workbook

# Configure the root logger: only records at ERROR level or above are emitted,
# each formatted as "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')

# Sentence terminators: runs of "。！？!?…", plus ASCII "." only when it is
# followed by whitespace or the end of the text, so decimals such as "3.14"
# are not split in the middle.
_SENTENCE_END_RE = re.compile(r"[。！？!?…]+|\.+(?=\s|$)")


def _has_word_char(fragment):
    """Return True if *fragment* contains at least one letter or digit."""
    return any(ch.isalnum() for ch in fragment)


def _count_sentences(text):
    """Count the sentences in *text* by splitting on sentence-ending punctuation.

    Line breaks are deliberately NOT treated as sentence boundaries: text
    extracted from a PDF is wrapped at the page layout width, so a newline
    usually falls in the middle of a sentence. Fragments without any letter
    or digit (for example a closing quote left over after a terminator) are
    not counted.
    """
    return sum(
        1 for fragment in _SENTENCE_END_RE.split(text) if _has_word_char(fragment)
    )


def analyze_pdf(pdf_path):
    """Extract the text of a PDF and count its characters, words and sentences.

    Args:
        pdf_path: Path of the PDF file to analyze.

    Returns:
        A ``(char_count, word_count, sentence_count)`` tuple, where

        * ``char_count`` is the length of the extracted text after blank
          lines are removed (remaining whitespace and line breaks included);
        * ``word_count`` is the number of jieba tokens that contain at least
          one letter or digit, so whitespace and punctuation tokens are not
          counted as words;
        * ``sentence_count`` is the number of punctuation-delimited sentences.

        If anything goes wrong while reading or analyzing the file, the error
        is logged and ``(None, None, None)`` is returned instead.
    """
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Join once instead of repeated "+="; `or ""` keeps the join safe
            # should a page yield None rather than a string.
            text = "".join(page.extract_text() or "" for page in reader.pages)

        # Drop blank lines left behind by the text extraction.
        text = '\n'.join(line for line in text.split('\n') if line.strip())

        char_count = len(text)

        # jieba also yields whitespace and punctuation as tokens; only tokens
        # with real content are counted as words.
        word_count = sum(1 for token in jieba.cut(text) if _has_word_char(token))

        # A seg-only THULAC pass returns one output line per input line, so
        # splitting its output on "\n" counts layout lines, not sentences.
        # Sentences are counted from the punctuation instead.
        sentence_count = _count_sentences(text)

        return char_count, word_count, sentence_count

    except Exception as exc:
        # Best-effort per-file boundary: report the failure and let the caller
        # continue with the remaining files.
        logging.error("Error processing file %s: %s", pdf_path, exc)
        return None, None, None

# Folder containing the PDF files to analyze (replace with your own path).
folder_path = r"C:\Users\xxy_2\Desktop\临时"

# Workbook that receives the per-file statistics.
excel_file_path = r"C:\Users\xxy_2\Desktop\年报测试.xlsx"

# One (filename, char_count, word_count, sentence_count) row per PDF that was
# analyzed successfully.
rows = []

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the resulting spreadsheet reproducible.
for filename in sorted(os.listdir(folder_path)):
    # Compare the extension case-insensitively so files such as "REPORT.PDF"
    # are not silently skipped.
    if not filename.lower().endswith('.pdf'):
        continue

    stats = analyze_pdf(os.path.join(folder_path, filename))

    # analyze_pdf reports a failure as a tuple of Nones (after logging the
    # error); such files are left out of the spreadsheet.
    if any(value is None for value in stats):
        continue

    rows.append((filename, *stats))

# Build the workbook: a header row followed by one row per analyzed file.
wb = Workbook()
ws = wb.active
ws.title = "PDF Analysis"

ws.append(['Bank', 'Character Count', 'Word Count', 'Sentence Count'])
for row in rows:
    ws.append(row)

wb.save(excel_file_path)

print(f"分析结果已保存到: {excel_file_path}")