不同格式的word文档合并为一个文件。
问题:
文件汇编,需要将70多个文件汇编成一个到一个文件里。最终汇编的方式是用word--插入--对象--文件中的文字。
但是由于原始文件有的设置了自动编号,合并文档后,编号导致混乱。有的没有设置标题样式,提取不出来目录。
最终汇总的时候,需要清除自动编号的域,已经生成的编号转换成文字,统一设置标题格式。
同一个段里,字体还不一样,需要清除Word的标题样式保留字体格式。
解决方式:
开始想用python+docx库解决。但是docx库在原始文档没有style的情况下,无论如何添加不进去。
if match(run.text,"第*章*"): #匹配模式为问号,及匹配一个任意字符
document.add_heading("",level=1)
paragraph.style = document.styles['heading 1']
这样的总是出错,因为heading不存在,查了官方文档,说word默认有很多样式,但是你不起用的话,文档里是没有这个样式的。docx文档里也没说怎么解决。
"no style with name 'Heading 1'"
换解决思路:
用 win32com 操作
里面用到一句
ActiveDocument.ConvertNumbersToText
在VBA里运行没有问题,但是win32调用也出错。
查找过程中查到一个用WIN32直接运行宏命令的方式,算球,全部代码写在vba里,python循环目录和重命名就好。这样的好处是,python只负责循环和重命名,有需求直接修改vba.bas就好了,宏兼容性最高。
于是代码如下:
import os
import glob
from win32com.client import constants
from win32com.client import Dispatch #需要安装的是pypiwin32模块
App=Dispatch('Word.Application')
App.Visible=0
def addmaco(file):
App.DisplayAlerts = 0
doc = App.Documents.Open(file)
#导入宏代码
#通过文档实例,获取VBProject的组件,其中VBComponents中的参数至关重要,因为ThisDocument表示该文档,也就是说所有这篇生成文档的操作在该组件中都可以捕获#到,那么就可以在里面创建Document_Open函数来监控文档被打开
docCode = doc.VBProject.VBComponents("ThisDocument").CodeModule
macro = ""
#vba.bas为宏文件,需要导入到ThisDocument,ThisDocument即对应word下,按Alt+F11,调出vba窗口,该文档下的Microsoft Word对象下的ThisDocument。
string=open("vba.bas", encoding='utf-8')
macro=string.read()
docCode.AddFromString(macro)
#doc.Save() #添加宏这里不能保存,因为word无法保存为不含宏的文件.但是添加宏以后就可以运行了。python里就是用到这个功能
def DelMacro(file): #添加宏
App.DisplayAlerts = 0
doc = App.Documents.Open(file)
doc.Save()
def get_file_path(root_path,file_list,dir_list):
#获取该目录下所有的文件名称和目录名称
dir_or_files = os.listdir(root_path)
for dir_file in dir_or_files:
#获取目录或者文件的路径
dir_file_path = os.path.join(root_path,dir_file)
#判断该路径为文件还是路径
if os.path.isdir(dir_file_path):
dir_list.append(dir_file_path)
#递归获取所有文件和目录的路径
get_file_path(dir_file_path,file_list,dir_list)
else:
file_list.append(dir_file_path)
root_path = r"D:\企业制度"
file_list = []
dir_list = []
path='d:/'
get_file_path(root_path,file_list,dir_list)
for dirlst in dir_list:
path=dirlst
print(path)
preName="" #二级目录
dir=(dirlst.split("\\"))
if (len(dir))>2:
preName=(dir[2])
old_names = os.listdir(path) #取路径下的文件名,生成列表
for old_name in old_names: #遍历列表下的文件名
if os.path.isfile(path + "\\" +old_name): #防止文件夹加上.pdf后缀,先判断是否是文件,必须带路径
if old_name.endswith('.doc') or old_name.endswith('.docx'):
file=path + "\\" +old_name
print("打开:"+file)
doc = App.Documents.Open(file)
addmaco(file)
App.Application.Run("numTotext")
filename=App.ActiveDocument.Sentences(1).Text.replace('\r','') #删除字符串里的回车符
if filename=="": #如果第一行为空,取第二行
filename=App.ActiveDocument.Sentences(2).Text.replace('\r','')
filename=filename.replace('\t','')
filename=filename.replace('\t','')
#filenamePara=str(App.ActiveDocument.Paragraphs(1)).replace('\r','')
#filenamePara=filenamePara.replace('\t','')
#print(filenamePara)
print(filename)
newpath="d:\\企业制度统一格式\\"+preName
Newfile=newpath+"\\"+preName+"_"+str(filename)
print(Newfile)
doc.SaveAs(Newfile,16) #将所有doc,docx都统一保存为docx,后面带16
doc.Close()
App.Quit()
vba.bas文件如下
Sub numTotext()
'数字转文本
'仅list是 ActiveDocument.range.listformat.ConvertNumbersToText
ActiveDocument.ConvertNumbersToText '自动编号的数字会变成文本,但是域还在
'删除第一行'单位名称'
Selection.HomeKey Unit:=wdStory
Selection.Find.ClearFormatting
Selection.Find.Replacement.ClearFormatting
With Selection.Find
.Text = "单位名称^p" '包括换行
.Replacement.Text = ""
.Forward = True
.Wrap = wdFindAsk
.Format = False
.MatchCase = False
.MatchWholeWord = False
.MatchByte = True
.MatchWildcards = False
.MatchSoundsLike = False
.MatchAllWordForms = False
.Execute Replace:=wdReplaceOne
End With
Selection.HomeKey Unit:=wdStory '回到首行
Dim searchStr
searchStr = Chr(13)
If (InStr(ActiveDocument.Paragraphs(1).Range.Text, searchStr) = 1) Then '如果第一行是回车,删除
Selection.Range.Delete
End If
'去除底纹
Selection.WholeStory
Options.DefaultHighlightColorIndex = wdNoHighlight
Selection.Range.HighlightColorIndex = wdNoHighlight
'Sub 清除Word的标题样式保留字体格式()
Dim para As Paragraph
Dim fnt As Font
Dim pfmt As Variant
Dim lineup1 As Variant
Dim lineup2 As Variant
Dim linedo1 As Variant
Dim linedo2 As Variant
Dim fntName As Variant
Dim fntsize As Variant
Dim shSj As Variant
Dim lineSpace As Variant
Dim lnSpcrule As Variant
Dim alnMent As Variant
With ActiveDocument.Styles("正文").Font
.NameFarEast = "仿宋_GB2312"
.NameAscii = "Calibri"
.NameOther = "Calibri"
.Name = "Calibri"
.Size = 16
End With
'无标记
With ActiveWindow.View.RevisionsFilter
.Markup = wdRevisionsMarkupNone
.View = wdRevisionsViewFinal
End With
'接受修订
ActiveDocument.AcceptAllRevisions
ActiveDocument.TrackRevisions = False
For Each para In ActiveDocument.Paragraphs
With para
'If .Style <> ActiveDocument.Styles("正文") 不同的文章的正文格式也不一样,所以无论什么格式通通成正文
Set fnt = .Range.Font '转换样式之前,先记下来字体
Set pfmt = .Style.ParagraphFormat
fntName = .Range.Font.Name
'MsgBox (.Range.Text)
fntsize = .Range.Font.Size
alnMent = .Range.ParagraphFormat.Alignment
shSj = .Range.ParagraphFormat.FirstLineIndent
'lineup1 = .Style.ParagraphFormat.LineUnitBefore
'lineup2 = .Style.ParagraphFormat.SpaceBefore
'linedo1 = .Style.ParagraphFormat.LineUnitAfter
'linedo2 = .Style.ParagraphFormat.SpaceAfter
'fnt = .Range.Font
'pfmt = .Range.ParagraphFormat.LineSpacing
lineup1 = .Range.ParagraphFormat.LineUnitBefore
lineup2 = .Range.ParagraphFormat.SpaceBefore
linedo = .Range.ParagraphFormat.LineUnitAfter
linedo2 = .Range.ParagraphFormat.SpaceAfter
lnSpcrule = .Range.ParagraphFormat.LineSpacingRule
lineSpace = .Range.ParagraphFormat.LineSpacing
.Style = ActiveDocument.Styles("正文")
.Range.ListFormat.RemoveNumbers '清除自动编号的代码
.Range.Font = fnt
.Range.Font.Name = fntName
.Range.ParagraphFormat.Alignment = alnMent
.Range.Font.Size = fntsize
'.Range.ParagraphFormat.LineSpacing = pfmt
If lineSpace < =1 Then
lineSpace = 1
End If
.Range.ParagraphFormat.LineSpacing = lineSpace
'.Range.ParagraphFormat = pfmt
.Range.ParagraphFormat.FirstLineIndent = shSj
.Range.ParagraphFormat.LineUnitBefore = lineup1
.Range.ParagraphFormat.SpaceBefore = lineup2
.Range.ParagraphFormat.LineUnitAfter = linedo1
.Range.ParagraphFormat.SpaceAfter = linedo2
.Range.ParagraphFormat.LineSpacingRule = lnSpcrule
.Range.ParagraphFormat.LineSpacing = lineSpace
'End If
End With
Next
'设置页面大小
'With ActiveDocument.PageSetup
' .LineNumbering.Active = False
' .Orientation = wdOrientPortrait
' .TopMargin = CentimetersToPoints(3.7)
' .BottomMargin = CentimetersToPoints(3.5)
' .LeftMargin = CentimetersToPoints(2.6)
' .RightMargin = CentimetersToPoints(2.6)
'End With
With ActiveDocument.Styles("标题 1").Font
.NameFarEast = "方正小标宋简体"
.Size = 22
End With
With ActiveDocument.Styles("标题 1").ParagraphFormat
.LeftIndent = CentimetersToPoints(0)
.RightIndent = CentimetersToPoints(0)
.SpaceBefore = 0
.SpaceBeforeAuto = False
.SpaceAfter = 0
.SpaceAfterAuto = False
.LineSpacingRule = wdLineSpaceSingle
.Alignment = wdAlignParagraphCenter
.WidowControl = True
.KeepWithNext = False
.KeepTogether = False
.PageBreakBefore = False
End With
With ActiveDocument.Styles("标题 1")
.AutomaticallyUpdate = False
.BaseStyle = "正文"
.NextParagraphStyle = "标题 2"
End With
ActiveDocument.Paragraphs(1).Range.Select
Selection.Style = ActiveDocument.Styles("标题 1")
For i = 1 To 10 '循环十次,替换空格
Selection.WholeStory
Selection.Find.ClearFormatting
Selection.Find.Replacement.ClearFormatting
With Selection.Find
.Text = "章 "
.Replacement.Text = "章 "
.Execute Replace:=wdReplaceAll
End With
Next i
'添加分页
Selection.EndKey Unit:=wdStory
Selection.InsertBreak Type:=wdPageBreak
End Sub