批量分析word文档中的数据
1 读取文件名字
import docx
import re, os
# Gather the legacy .doc files from the source folder, leaving out any name that
# contains '$' (presumably Word's "~$" temporary files -- confirm).
word_name_list = [
    fname
    for fname in os.listdir(r'C:\Users\xue\Desktop\各种表\2019级研究生培养计划\all')
    if '$' not in fname and fname.endswith('.doc')
]
# Sanity check: number of files found and the first ten names.
print(len(word_name_list), '\n', word_name_list[:10])
输出
2 尝试采用pywin32
pywin32(win32com)可以同时处理doc和docx
import docx
import re, os
from win32com.client import Dispatch
# Print the text of every table cell of one .doc file by driving Word over COM.
word = Dispatch('Word.Application')  # start the Word application
# word = DispatchEx('Word.Application')  # alternative: start a separate process
word.Visible = 0  # run in the background, no window
word.DisplayAlerts = 0  # suppress alert dialogs
path = r"C:\Users\xue\Desktop\各种表\2019级研究生培养计划\all\畅檀应用统计QSZ20190070.doc"
try:
    # NOTE(review): Encoding is passed as a codec name; Word's object model
    # describes this argument as an MsoEncoding constant -- confirm it has
    # the intended effect.
    doc = word.Documents.Open(FileName=path, Encoding='gbk')
    try:
        # Iterating doc.paragraphs instead was found to return incomplete
        # table data, so the tables are walked cell by cell.
        # Original author's note: this is still error-prone and the table
        # data can still come back incomplete.
        for t in doc.Tables:
            for row in t.Rows:
                for cell in row.Cells:
                    print(cell.Range.Text)
    finally:
        doc.Close()
finally:
    # Quit must be *called*; always shut Word down, even after an error,
    # so no background WINWORD process is left behind.
    word.Quit()
输出出错!
3 改用docx模块
docx模块(python-docx)只能处理.docx后缀的文件,不能读取.doc文件!
import pythoncom
import os
import win32com.client as wc
4 .doc文件转化为.docx文件
4.1 先将所有文件转化为docx
# Folder holding the original training-plan documents.
root = r'C:\Users\xue\Desktop\各种表\2019级研究生培养计划\all'
# Names of the .doc files to convert; entries containing '$' are skipped.
doc_name_list = [
    fname
    for fname in os.listdir(root)
    if '$' not in fname and fname.endswith('.doc')
]
# Show the count and the first three names.
print(len(doc_name_list), '\n', doc_name_list[:3])
4.2 找出docx文件名字
# Names of the files in root that are already .docx; entries containing '$'
# are skipped.
docx_name_list = [
    fname
    for fname in os.listdir(root)
    if '$' not in fname and fname.endswith('.docx')
]
# Show the count and the complete list.
print(len(docx_name_list), '\n', docx_name_list[:])
4.3 先创建一个放docx的文件夹
# Folder that receives the converted .docx files.
docx_dir = root + '\\docx'
# exist_ok=True creates the folder only when it is missing, without a separate
# existence check that could race with another process.
os.makedirs(docx_dir, exist_ok=True)
4.4 从最后开始替换某字符串几次
def rreplace(s: str, old: str, new: str, occurrence: int) -> str:
    """Replace the last ``occurrence`` occurrences of ``old`` in ``s`` with ``new``.

    Args:
        s: The string to operate on.
        old: The substring to look for, counted from the right.
        new: The replacement text.
        occurrence: Maximum number of replacements. ``0`` leaves ``s``
            unchanged; a negative value replaces every occurrence
            (this follows ``str.rsplit``'s ``maxsplit`` semantics).

    Returns:
        The resulting string. If ``old`` is empty or does not occur,
        ``s`` is returned unchanged.
    """
    if not old:
        # str.rsplit raises ValueError for an empty separator; there is
        # nothing meaningful to replace, so hand the input back.
        return s
    return new.join(s.rsplit(old, occurrence))
4.5 doc转化为docx
def doc_to_docx(doc_name):
    """Convert one .doc file to .docx, saved in a ``docx`` folder beside it.

    The output path is the input path with ``\\docx`` inserted before the
    file name and the extension replaced by ``.docx``.

    Args:
        doc_name: Full absolute path of the .doc file, separated with
            backslashes. Per the original author's note, Word does not find
            the file when given a relative path or forward slashes.

    Returns:
        None. Conversion errors are printed, not raised, so a batch loop
        keeps going (best effort).
    """
    pythoncom.CoInitialize()
    # Bind before the try so the finally block can test it even when
    # Dispatch itself fails.
    word = None
    try:
        word = wc.Dispatch("Word.Application")  # start the Word application
        # NOTE(review): Encoding is passed as a codec name; Word's object
        # model describes this argument as an MsoEncoding constant -- confirm.
        doc = word.Documents.Open(doc_name, Encoding='utf-8')
        try:
            target = rreplace(doc_name, "\\", "\\docx\\", 1)
            # Swap only the extension; a plain str.replace would also rewrite
            # any ".doc" that appears elsewhere in the path.
            target = os.path.splitext(target)[0] + ".docx"
            # Make sure the destination folder exists before Word writes to it.
            os.makedirs(os.path.dirname(target), exist_ok=True)
            # File format 12 makes Word write a .docx file.
            doc.SaveAs(target, 12, False, "", True, "", False, False, False, False)
        finally:
            doc.Close()
    except Exception as e:
        # Python 3 exceptions have no ``.message`` attribute; print the
        # exception itself.
        print(e)
    finally:
        # When driving Word over COM, always make sure the application exits.
        try:
            if word is not None:
                word.Quit()
        finally:
            del word
            # Release COM resources.
            pythoncom.CoUninitialize()
4.6 改变doc文件名字为docx,并存到docx文件夹下面
# Convert every listed .doc file; the converter is given the full
# backslash-separated path.
for i in doc_name_list:
    doc_to_docx('\\'.join((root, i)))
4.7 移动docx文件到docx文件夹
import shutil
# Move the files that were already .docx into the "docx" sub-folder of root.
for i in docx_name_list:
    shutil.move('\\'.join((root, i)), '\\'.join((root, 'docx')))
5 读取所有的docx列表(就是所有已经转化为docx的文件)
import os
# Re-point root at the folder that holds the .docx files.
root = r"C:\Users\xue\Desktop\各种表\2019级研究生培养计划\all\docx"
# Keep names that contain '.docx' and do not contain '$'.
word_name_list = [
    entry
    for entry in os.listdir(root)
    if '$' not in entry and '.docx' in entry
]
# Show the count and the first five names.
print(len(word_name_list), '\n', word_name_list[:5])
6 读取docx文件中的所有字符串
import docx
def read_docx(fn):
    """Return the text of all table cells in a .docx file as one string.

    Tables are walked in document order, row by row and cell by cell. Each
    cell's text is stripped of surrounding whitespace, and a text that has
    already been collected is not added again (presumably to drop the
    repeats that merged cells produce -- confirm). Text outside tables
    (``doc.paragraphs``) is not read.

    Args:
        fn: Path of the .docx file to open with ``docx.Document``.

    Returns:
        The distinct stripped cell texts, joined with no separator.
    """
    doc = docx.Document(fn)
    seen = set()  # O(1) membership test for the de-duplication
    text_list = []  # keeps first-seen order for the final join
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                # Strip first, then compare: the collected values are
                # stripped, so comparing the raw text would miss repeats
                # that carry surrounding whitespace.
                cell_text = cell.text.strip()
                if cell_text not in seen:
                    seen.add(cell_text)
                    text_list.append(cell_text)
    return ''.join(text_list)
# table_num = len(doc.tables)
# # 获取文档的表格个数
# print(table_num)
# table_0 = doc.tables[0]
# # 选取第一个表
# table_rows = len(table_0.rows)
# # 获取第一个表的行数
# print(table_rows)
# tab = doc.tables[0].rows[0].cells[0]
# # 获取第一张表第一行第一列数据
# print(tab.text)
# par = doc.paragraphs[2]
# # 读取第三段数据
# print(par.text)
7 读取和匹配信息
import re
# Field patterns, compiled once rather than for every file. Each captures the
# text between two marker strings found in the concatenated table text.
_NAME_RE = re.compile(r'名(.*?)入')
_SCHOOL_RE = re.compile(r'职务(.*?)专')
_YEARS_RE = re.compile(r'学制(.*?)入学')
_KIND_RE = re.compile(r'类别(.*?)入学')
_MAJOR_RE = re.compile(r'称(.*?)研')
_FIELD_RE = re.compile(r'向(.*?)等')


def _first_match(pattern, text, label, source):
    """Return the first capture of ``pattern`` in ``text``.

    Raises IndexError (the same type a bare ``findall(...)[0]`` raises), but
    with a message naming the missing field and the file it came from.
    """
    found = pattern.search(text)
    if found is None:
        raise IndexError('field %r not found in %r' % (label, source))
    return found.group(1)


# Build one record per .docx file:
# (index, name, school, years, kind, major, field).
information = []
for i, item in enumerate(word_name_list):
    text = read_docx(root + '\\' + item)
    # print(text[:1000])
    names = _first_match(_NAME_RE, text, 'names', item).replace(' ', '')
    # The capture can hold several whitespace-separated pieces; keep the
    # longest one. default='' avoids a ValueError when the capture is blank.
    school = max(
        _first_match(_SCHOOL_RE, text, 'school', item).split(),
        key=len,
        default='',
    )
    years = _first_match(_YEARS_RE, text, 'years', item)
    kind = _first_match(_KIND_RE, text, 'kind', item)
    major = _first_match(_MAJOR_RE, text, 'major', item)
    field = _first_match(_FIELD_RE, text, 'field', item)
    info = (i, names, school, years, kind, major, field)
    information.append(info)
    print(info)
# Summary: record count, then the first and last three records.
print(len(information), information[:3], information[-3:], sep='\n')
输出
8 测试代码(分析数据,编写正则)
# 读取和匹配信息
import re
# Scratch cell for developing the regular expressions against one document.
# It collects into its own list so the batch results in `information` are not
# overwritten before they are saved.
sample_index = 27
test_information = []
text = read_docx(root + '\\' + word_name_list[sample_index])
# print(text[:1000])
names = re.compile(r'名(.*?)入').findall(text)[0]
names = names.replace(' ', '')
# Keep every candidate for the school field so they can be inspected.
school = re.compile(r'职务(.*?)专').findall(text)
print(school)
# print(max(school, key=len))
years = re.compile(r'学制(.*?)入学').findall(text)[0]
kind = re.compile(r'类别(.*?)入学').findall(text)[0]
major = re.compile(r'称(.*?)研').findall(text)[0]
field = re.compile(r'向(.*?)等').findall(text)[0]
# Label the record with the index of the document actually read, not with a
# loop variable left over from an earlier cell.
info = (str(sample_index), names, school, years, kind, major, field)
test_information.append(info)
print(info)
输出
9 存入数据到磁盘
9.1 存入txt
# Write one space-separated line per record to a text file.
path = r"C:\Users\xue\Desktop\各种表\2019级研究生培养计划\2019所有研究生信息汇总.txt"
# An explicit encoding keeps the Chinese text from depending on the machine's
# locale code page, which could fail on characters it cannot represent.
with open(path, 'w', encoding='utf-8') as fp:
    fp.write('\n'.join('%s %s %s %s %s %s %s' % x for x in information))
9.2 转化为pandas并存入csv
import pandas as pd
# Turn the records into a DataFrame indexed by 序号 and write it as CSV.
path = r"C:\Users\xue\Desktop\各种表\2019级研究生培养计划\2019所有研究生信息汇总.csv"
data = information
print(data[:5])
df = pd.DataFrame(data=data, columns=['序号','姓名','本科学校','学制','学位类别','专业','研究方向'])
# set_index makes the 序号 column the row index, removes it from the columns
# and names the index "序号" in one step. reindex only selects rows by label,
# so it gives all-NaN rows whenever 序号 differs from the positional index.
df = df.set_index('序号')
df.to_csv(path)
输出
9.3 存入excel
# Write the same DataFrame as an Excel workbook.
# Saved as .xlsx: the legacy .xls format needs the xlwt engine, which recent
# pandas releases no longer support.
path = r"C:\Users\xue\Desktop\各种表\2019级研究生培养计划\2019所有研究生信息汇总.xlsx"
df.to_excel(path)