doc 格式的文档无法直接通过 python-docx 包读取,需要先转换为 docx 格式。
docx格式的文件本质上是一个ZIP文件。
如果将后缀改为zip就能看清内部结构。
使用 python-docx 包提取 Word 文档中的表格非常方便。
这段代码还可以提取docx中的图片。
import os, shutil, xlwt, re
from docx import Document
from win32com import client as wc
# Folder that is scanned (recursively) for the source .doc/.docx files.
path = r'C:\Users\Administrator\Desktop\0221\正常发布的'
# Parent folder of `path`; the resulting spreadsheet is saved here. The ".."
# segment is resolved up front so every path derived from it is a plain
# absolute path (some of them are later handed to Word).
excelpath = os.path.normpath(os.path.join(path, ".."))
# Staging folder, a sibling of `path`, that receives the .docx copies.
target = os.path.join(excelpath, "docx")
# Folder that receives the images extracted from the documents.
imgPath = r'C:\Users\Administrator\Desktop\0221\pic'
# Path of the most recently written image; '' until one has been written.
imgabspath = ''

# exist_ok avoids the check-then-create race of exists() + mkdir(), and
# makedirs also creates a missing parent folder instead of failing.
os.makedirs(target, exist_ok=True)
os.makedirs(imgPath, exist_ok=True)

# Launch Word only after the folders are ready, so a failure above does not
# leave a Word process running in the background.
word = wc.Dispatch("Word.Application")
# Stage every Word document found under `path` into `target` as .docx:
# legacy .doc files are re-saved through the running Word instance (`word`),
# existing .docx files are simply copied.
WD_FORMAT_XML_DOCUMENT = 12  # WdSaveFormat value that makes Word write .docx
try:
    for root, dirs, files in os.walk(path):
        for name in files:
            # "~$..." entries are the lock files Word leaves next to open
            # documents; they are not documents and cannot be opened.
            if name.startswith('~$'):
                continue
            stem, extension = os.path.splitext(name)
            # Compare case-insensitively so ".DOC" / ".Docx" are not missed.
            extension = extension.lower()
            if extension not in ('.doc', '.docx'):
                continue
            # Absolute, normalised paths: the staged name always ends in a
            # lower-case ".docx", which is what the later steps look for.
            source = os.path.abspath(os.path.join(root, name))
            docxabs = os.path.abspath(os.path.join(target, stem + '.docx'))
            if extension == '.doc':
                doc = word.Documents.Open(source)
                try:
                    doc.SaveAs(docxabs, WD_FORMAT_XML_DOCUMENT)
                finally:
                    # Release the document even when the save fails.
                    doc.Close()
            else:
                shutil.copyfile(source, docxabs)
finally:
    # Always shut Word down; otherwise a hidden WINWORD process survives any
    # error raised while converting.
    word.Quit()
# Create the output workbook with a single sheet and write the column titles
# into row 0. Column order matters: the data rows written later address these
# columns by index (0-8).
workbook = xlwt.Workbook(encoding = 'GBK')
worksheet = workbook.add_sheet('公司列表')
header_titles = (
    '单位名称',              # 0: company name
    '营业执照(副本)注册号',  # 1: business licence (copy) registration number
    '单位性质',              # 2: type of organisation
    '单位地址',              # 3: address
    '联系人',                # 4: contact person
    '联系电话',              # 5: contact phone
    '单位简介',              # 6: company profile
    '招聘信息',              # 7: job openings
    '营业执照',              # 8: business licence (image path)
)
for column_index, title in enumerate(header_titles):
    worksheet.write(0, column_index, title)
# Characters Windows does not allow in a file name. The company name read from
# each document is reused as a file name, so these are stripped first.
_INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


def _safe_filename(text):
    """Return *text* without characters that Windows forbids in file names."""
    return _INVALID_FILENAME_CHARS.sub('', text).strip(' .')


def _save_images(document, stem):
    """Write the images embedded in *document* into ``imgPath``.

    Each image is stored as ``<stem><original extension>``, so images that
    share an extension end up in the same file and the last one wins.

    Returns the path of the last image written, or '' if there was none.
    """
    last_written = ''
    for rel in document.part.rels.values():
        # Identify images by relationship type rather than by the target
        # path, and skip linked (external) images: they have no embedded part
        # to read bytes from.
        if rel.is_external or not rel.reltype.endswith('/image'):
            continue
        extension = os.path.splitext(rel.target_ref)[1]
        last_written = imgPath + "/" + stem + extension
        with open(last_written, "wb") as f:
            f.write(rel.target_part.blob)
    return last_written


# Read the first table of every staged .docx, copy its fields into one
# spreadsheet row, export the embedded images, rename the .docx after the
# company, and finally save the workbook.
#
# Row 0 holds the header. The counter lives outside the walk so rows keep
# advancing across sub-directories instead of restarting at 1.
c_row = 1
for root, dirs, files in os.walk(target):
    for name in files:
        # Filter before parsing: anything that is not a .docx (or is a Word
        # "~$" lock file) must neither be opened nor consume a row.
        if name.startswith('~$') or not name.lower().endswith('.docx'):
            continue
        print(name)
        docx_file = os.path.join(root, name)
        d = Document(docx_file)
        if not d.tables:
            print('skipped, no table found: ' + name)
            continue
        table = d.tables[0]

        company = table.cell(0, 1).text.replace('\n', '').replace(' ', '')
        worksheet.write(c_row, 0, company)
        worksheet.write(c_row, 1, table.cell(0, 3).text)
        worksheet.write(c_row, 2, table.cell(1, 1).text)
        worksheet.write(c_row, 3, table.cell(2, 1).text)

        # One cell holds both the contact name (CJK characters) and the
        # 11-digit phone number. findall() returns a list, so join the matches
        # (in order, no separator) to hand the sheet a plain string.
        contact = table.cell(2, 3).text
        worksheet.write(c_row, 4, ''.join(re.findall(r'[\u4E00-\u9FA5]+', contact)))
        worksheet.write(c_row, 5, ''.join(re.findall("[0-9]{11}", contact)))

        # The profile row sits at index 3 or 4 depending on the form layout;
        # the job-openings row always follows it.
        for label_row in (3, 4):
            if table.cell(label_row, 0).text == '单位简介':
                worksheet.write(c_row, 6, table.cell(label_row, 1).text)
                worksheet.write(c_row, 7, table.cell(label_row + 1, 1).text)
                break

        # Fall back to the current file stem when the company cell is empty or
        # consists only of characters that cannot appear in a file name.
        stem = _safe_filename(company) or os.path.splitext(name)[0]

        imgabspath = _save_images(d, stem)
        if imgabspath:
            worksheet.write(c_row, 8, imgabspath)

        # Rename the staged document after the company. os.rename() raises on
        # Windows when the destination exists, so report the clash and keep
        # the original name rather than aborting before the workbook is saved.
        renamed = os.path.join(root, stem + '.docx')
        if os.path.normcase(renamed) != os.path.normcase(docx_file):
            if os.path.exists(renamed):
                print('not renamed, target already exists: ' + renamed)
            else:
                os.rename(docx_file, renamed)
        c_row += 1

workbook.save(os.path.join(excelpath, '公司招聘列表.xls'))