#遍历文件夹下的所有PDF文件,统计每个文件的字符数、词语数和句子数,输出为Excel文件
import logging
import os
import re

import jieba
import PyPDF2
import thulac
from openpyxl import Workbook
# Configure root logging: report ERROR and above, each record prefixed with
# its timestamp and level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.ERROR,
)
# Sentence-terminating punctuation (Chinese and ASCII). The ASCII full stop is
# deliberately excluded so that decimal numbers such as "3.5" are not split.
_SENTENCE_END = re.compile(r"[。！？!?…]+")
# Matches any letter, digit or CJK character; used to reject fragments that
# consist only of leftover punctuation (e.g. a closing quote after "。").
_HAS_WORD_CHAR = re.compile(r"\w")


def _count_sentences(text):
    """Return the number of sentences in *text*, split on end punctuation."""
    # Line breaks in extracted PDF text follow the page layout and frequently
    # fall in the middle of a sentence, so they are dropped before splitting.
    flattened = text.replace("\n", "")
    return sum(
        1
        for fragment in _SENTENCE_END.split(flattened)
        if _HAS_WORD_CHAR.search(fragment)
    )


def analyze_pdf(pdf_path):
    """Extract the text of a PDF and count its characters, words and sentences.

    Args:
        pdf_path: Path of the PDF file to analyse.

    Returns:
        A tuple ``(char_count, word_count, sentence_count)``:

        * ``char_count`` - length of the extracted text after blank lines are
          removed (the remaining line breaks and spaces are included).
        * ``word_count`` - number of tokens produced by ``jieba.cut``.
          NOTE(review): jieba also emits punctuation and whitespace as
          tokens, so they are part of this count - confirm this is intended.
        * ``sentence_count`` - number of text fragments delimited by
          sentence-ending punctuation (``。！？!?…``).

        If anything goes wrong while reading or analysing the file, the error
        is logged and ``(None, None, None)`` is returned instead of raising.
    """
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Pages are read lazily from the open file, so extraction must
            # happen inside the ``with`` block. ``or ""`` guards against a
            # page whose extraction yields nothing.
            page_texts = [page.extract_text() or "" for page in reader.pages]
        text = "".join(page_texts)
        # Drop blank / whitespace-only lines.
        text = '\n'.join(line for line in text.split('\n') if line.strip())
        char_count = len(text)
        word_count = sum(1 for _ in jieba.cut(text))
        # The previous implementation split THULAC's word-segmented output on
        # "\n", which yields the number of lines, not sentences; THULAC has no
        # sentence segmentation, so punctuation-based splitting is used.
        sentence_count = _count_sentences(text)
        return char_count, word_count, sentence_count
    except Exception as e:
        # Per-file boundary: one unreadable PDF must not abort a whole batch.
        logging.error(f"Error processing file {pdf_path}: {e}")
        return None, None, None
# Parallel result lists: index i of every list describes the same PDF file.
char_counts = []
word_counts = []
sentence_counts = []
banks = []
# Folder that contains the PDF files to analyse.
folder_path = r"C:\Users\xxy_2\Desktop\临时"  # replace with your folder path
# Walk the folder in sorted order so the spreadsheet rows are stable between
# runs (os.listdir returns entries in arbitrary order).
for filename in sorted(os.listdir(folder_path)):
    # Compare the extension case-insensitively so "REPORT.PDF" is not skipped.
    if not filename.lower().endswith('.pdf'):
        continue
    pdf_path = os.path.join(folder_path, filename)
    char_count, word_count, sentence_count = analyze_pdf(pdf_path)
    # analyze_pdf returns None values for a file it could not process (it
    # logs the error itself); such files are left out of the report.
    if char_count is None or word_count is None or sentence_count is None:
        continue
    char_counts.append(char_count)
    word_counts.append(word_count)
    sentence_counts.append(sentence_count)
    banks.append(filename)
# Create a new Excel workbook and use its default sheet.
wb = Workbook()
ws = wb.active
ws.title = "PDF Analysis"
# Header row (row 1 of the fresh sheet).
ws.append(['Bank', 'Character Count', 'Word Count', 'Sentence Count'])
# One data row per successfully analysed file, starting at row 2.
for row in zip(banks, char_counts, word_counts, sentence_counts):
    ws.append(row)
# Save the workbook and report where it was written.
excel_file_path = r"C:\Users\xxy_2\Desktop\年报测试.xlsx"
wb.save(excel_file_path)
print(f"分析结果已保存到: {excel_file_path}")
# Count the characters, words, and sentences of PDF files
# (Web page residue: latest recommended article published 2024-10-09 11:40:13)