python-docx处理docx文档
很久没有写文章了,最近刚入职,决定捡起来再写写。
最近在做一些文档性的工作,主要就是把wiki上的内容收集起来,做成一份word文档,这边主要使用了两个库
拉取网页数据:requests_html
生成docx文档:python-docx
这边主要写一写python-docx的用法相关,以及自己写的一个脚本
python-docx常用指令
docx.add_paragraph(text, style) #插入段落,并指定文本以及格式
docx.add_heading(text, level) #插入level级别的标题(标题也是段落)
docx.paragraphs[index] #操作特定的段落
docx.Document(filepath) #读取文档;不传入参数时生成一个空文档对象
主程序部分
from docx import Document
from docx.shared import Inches
import docx
from docx.enum import dml
import requests
from requests_html import HTMLSession
import string
# main()
# Build (or extend) a Word document from the MC-Basic command pages of the
# softMC wiki: collect the command links from the category page, then append
# each command page to the document via read_html().

# Output directory and file name. Try to open an existing document; if that
# fails for any ordinary reason, start from a new, empty one.
filePath = 'C:/Users/hyx/Desktop/py_tool_file/docx/'
fileName = 'MC-Basic Document.docx'
try:
    document = Document(filePath + fileName)
except Exception:
    # Best-effort open. `except Exception` (rather than a bare `except:`)
    # lets KeyboardInterrupt / SystemExit propagate.
    document = Document()
# NOTE(review): `id` shadows the builtin id(); the name is kept because it is
# passed to read_html() and may be referenced elsewhere in the file.
id = 0

# Fetch the category index page.
session = HTMLSession()
baseWeb = 'http://softmc.servotronix.com/wiki/'
wikiurl = 'http://softmc.servotronix.com/wiki/Category:MC-Basic:Commands'
r = session.get(wikiurl)

# Keep only links whose first 'MC-Basic:' occurrence is at offset 6, and drop
# those first 6 characters so the remainder can be appended to baseWeb.
# A comprehension is used so no loop variable leaks into module scope (the
# previous loop variable was named `str`, which rebound the builtin `str`).
strArray = [
    link[6:] for link in r.html.links if link.find('MC-Basic:') == 6
]
strArray = str_list_sort(strArray)

count = 1
document.add_heading('MC-Basic Document', 0)
for s in strArray:
    read_html(baseWeb + s, document, s, id)
    count = count + 1
    if count > 2:
        # Save after every second page, then reopen the saved file and
        # continue with the reopened document.
        document.save(filePath + fileName)
        count = 1
        try:
            document = Document(filePath + fileName)
        except Exception:
            document = Document()

# To process a single page instead:
# read_html('http://softmc.servotronix.com/wiki/MC-Basic:SYSTEM.NUMBERAXES',
#           document, 'MC-Basic:SYSTEM.NUMBERAXES', id)
#
# Example of adding styled runs to a paragraph:
# p = document.add_paragraph('A plain paragraph having some ')
# p.add_run('bold').bold = True
# p.add_run(' and some ')
# p.add_run('italic.').italic = True
document.save(filePath + fileName)
访问网页并存储数据部分
需要注意的是,使用 docx.add_heading 所生成的标题不会自动带有书签,因此这里安装了 python-docx 的 bookmark 分支来手动添加书签,用于后续的超链接指向。
def read_html(url, document, Name,id):
document.add_heading(Name, level=1)
bookmark = document.start_bookmark(Name)
document.end_bookmark(bookmark)
session = HTMLSession()
#baseWeb = 'http://softmc.servotronix.com'
#url = baseWeb + '/wiki/MC-Basic:LOC'
reText = ''
tail = 'See Also'
tail2 = 'Retrieved from '
r = session.get(url)
#print(r.headers,'\r\n')
#r.encoding = 'utf-8'
#print(r.html.text)
StartStr = '中文(简体)'
StartLoc = r.html.text.find(StartStr)
StartStrLen = 0
if StartLoc > 0:
StartStrLen = len(StartStr) + 1
else:
StartStr = 'Jump to: navigation, search'
StartLoc = r.html.text.find(StartStr)
StartStrLen = len(StartStr) + 1
strTemp = ''
returnStr = ''
StartLoc = StartLoc + StartStrLen
#print(StartLoc)
SFStr = 'Short form'
SFLoc = r.html.text.find(SFStr, StartLoc)
if SFLoc > 0:
EndLoc = SFLoc
strTemp = r.html.text[StartLoc: EndLoc]
returnStr = find_return_value(strTemp)
document.add_paragraph(r.html.text[StartLoc: EndLoc])
StartLoc = EndLoc + len(SFStr) + 1
p = document.add_heading(SFStr, level=2)