根据你的需求,这里提供一套完整的Python解决方案,支持递归遍历子文件夹、提取Word文档内容(段落+表格),并整合到Excel中。以下是代码实现及详细说明:
输出方式:每个Word文档的全部内容写入Excel的一个单元格。
完整代码
# -*- coding: utf-8 -*-
import os
from docx import Document
import pandas as pd
from tqdm import tqdm # 需安装:pip install tqdm
def extract_word_content(doc_path):
    """Extract the paragraph and table text of a Word document as one string.

    Non-blank paragraphs are emitted first, each as ``[paragraph] <text>``,
    followed by every table row as ``[table_row] <cell texts joined by ' | '>``.
    Entries are separated by newlines.

    Args:
        doc_path: Path handed straight to ``Document``.

    Returns:
        The combined text, or ``"Error reading file: <message>"`` if opening
        or reading the document raises. Errors are reported in the return
        value instead of being raised, so the loop in ``process_folder``
        keeps going after a bad file.
    """
    try:
        doc = Document(doc_path)

        # Paragraphs whose text is only whitespace are dropped.
        lines = [
            f"[paragraph] {para.text}"
            for para in doc.paragraphs
            if para.text.strip()
        ]

        # One output line per table row; cell texts are stripped and joined.
        for table in doc.tables:
            for row in table.rows:
                row_text = ' | '.join(cell.text.strip() for cell in row.cells)
                lines.append(f"[table_row] {row_text}")

        return '\n'.join(lines)
    except Exception as exc:
        # Deliberate best-effort: the message becomes the cell content.
        return f"Error reading file: {exc}"
def process_folder(root_folder):
    """Walk ``root_folder`` recursively and collect the text of every Word file.

    Args:
        root_folder: Directory to scan; sub-directories are visited through
            ``os.walk``.

    Returns:
        A ``pandas.DataFrame`` with one row per matched file and the columns
        ``文件名`` (file name), ``完整路径`` (full path) and ``文件内容``
        (the string returned by ``extract_word_content``). The three columns
        are present even when no file matches, so an exported sheet keeps its
        header row.
    """
    columns = ["文件名", "完整路径", "文件内容"]
    data = []

    for root, _, files in os.walk(root_folder):
        # Files are matched by extension only.
        # NOTE(review): python-docx is documented to read .docx packages;
        # legacy .doc files are still listed and are expected to carry the
        # "Error reading file" text from extract_word_content -- confirm.
        # Names starting with "~$" are the lock files Word keeps next to an
        # open document; they are not documents, so they are skipped.
        word_files = [
            f for f in files
            if f.lower().endswith(('.doc', '.docx')) and not f.startswith('~$')
        ]
        if not word_files:
            # Avoid starting an empty progress bar for this directory.
            continue

        # One progress bar per directory, labelled with the directory name.
        for file in tqdm(word_files, desc=f"Processing {os.path.basename(root)}"):
            file_path = os.path.join(root, file)
            data.append({
                "文件名": file,
                "完整路径": file_path,
                "文件内容": extract_word_content(file_path),
            })

    return pd.DataFrame(data, columns=columns)
if __name__ == "__main__":
# 配置参数
folder_path = r"C:\Users\esensoft\Desktop\报名推荐" # 修改为实际路径
output_excel = "汇总结果.xlsx"
# 执行处理
df = process_folder(folder_path)
# 导出Excel(自动处理格式)
with pd.ExcelWriter(output_excel, engine='openpyxl') as writer:
df.to_excel(writer, index=False, sheet_name='文档汇总')
# 自动调整列宽
for column in df.columns:
max_len = max(df[column].astype(str).map(len).max(), len(column))
writer.sheets['文档汇总'].column_dimensions[
writer.sheets['文档汇总'].cell(row=