Python 解析 Word、Excel 样式

公共文件 common.py

文件路径统一放在这个文件中,方便集中修改。

# 文件地址
def filePath():
    """Return the hard-coded location of the Word document to process."""
    documentPath = 'D:\\document\\test.docx'
    return documentPath


# Word 中的文件(图片) 保存的文件地址
def wordImagesSavePath():
    """Return the directory prefix under which images taken from Word files are saved.

    The value ends with a backslash so a file name can be appended directly.
    """
    imageDirectory = 'D:\\image\\'
    return imageDirectory

Doc 转 Docx(需Windows操作系统)

from win32com import client


def doc2docx(fn):
    """Convert a legacy ``.doc`` file to ``.docx`` through COM automation.

    The converted document is written to ``fn`` with an ``x`` appended
    (``report.doc`` -> ``report.docx``).

    Args:
        fn: Path of the ``.doc`` file handed to the office application.

    Raises:
        Whatever the COM layer raises when the application cannot be started
        or the document cannot be opened or saved. The document and the
        application are released before the exception propagates.
    """
    # The ProgID must match the installed office suite: "kwps.Application" is
    # the WPS preview edition; "wps.Application" and "Word.Application" are the
    # alternatives for regular WPS and Microsoft Office.
    word = client.Dispatch("kwps.Application")
    try:
        doc = word.Documents.Open(fn)
        try:
            # 12 selects the .docx (Open XML document) save format.
            doc.SaveAs("{}x".format(fn), 12)
        finally:
            # Close the document even when saving fails so the file is not left locked.
            doc.Close()
    finally:
        # Always shut the application down; otherwise a background process lingers.
        word.Quit()
不同 Office 版本对应的程序名(ProgID):

版本 | 程序
Microsoft Office | Word.Application
WPS | wps.Application
WPS 抢先版 | kwps.Application

解析 Docx 样式

from docx import Document


# 获取带样式的文本列表
def getContent(path):
    """Collect paragraph and run formatting from a ``.docx`` file.

    Args:
        path: Location of the document passed to ``Document``.

    Returns:
        A list with one dict per paragraph. Each dict holds the paragraph
        format values (``alignment``, ``leftIndent``, ``rightIndent``,
        ``firstLineIndent``, ``lineSpacing``, ``spaceBefore``, ``spaceAfter``)
        and a ``style`` list containing one dict per run with its font
        attributes and text.
    """
    parsed = Document(path)
    collected = []

    for paragraph in parsed.paragraphs:
        fmt = paragraph.paragraph_format
        runStyles = []
        entry = dict(
            alignment=fmt.alignment,               # paragraph alignment
            leftIndent=fmt.left_indent,            # left indent
            rightIndent=fmt.right_indent,          # right indent
            firstLineIndent=fmt.first_line_indent,  # first-line indent
            lineSpacing=fmt.line_spacing,          # line spacing
            spaceBefore=fmt.space_before,          # space before the paragraph
            spaceAfter=fmt.space_after,            # space after the paragraph
            style=runStyles,                       # per-run styles, filled below
        )

        for piece in paragraph.runs:
            font = piece.font
            runStyles.append(dict(
                name=font.name,                      # font name
                size=font.size,                      # font size
                bold=font.bold,                      # bold flag
                italic=font.italic,                  # italic flag
                rgb=font.color.rgb,                  # font colour
                highlightColor=font.highlight_color,  # highlight colour
                underline=font.underline,            # underline
                strike=font.strike,                  # strikethrough
                doubleStrike=font.double_strike,     # double strikethrough
                subscript=font.subscript,            # subscript
                superscript=font.superscript,        # superscript
                text=piece.text,                     # run text
            ))

        collected.append(entry)

    return collected


解析 Docx 图片(行号)

import docx
from docx.document import Document
from docx.text.paragraph import Paragraph
from docx.image.image import Image
from docx.parts.image import ImagePart
from docx.oxml.shape import CT_Picture
from PIL import Image
from io import BytesIO

# 这里引用了文章开头的 common.py 获取图片保存地址的
from common.common import wordImagesSavePath


def getPicture(document: Document, paragraph: Paragraph):
    """Return the first embedded image found in ``paragraph``.

    Args:
        document: The document that owns ``paragraph``; its part is used to
            resolve the picture's relationship id.
        paragraph: The paragraph searched for ``pic:pic`` elements.

    Returns:
        The image object of the first picture, or ``None`` when the paragraph
        holds no picture or the picture has no ``r:embed`` relationship.
    """
    pictures = paragraph._element.xpath('.//pic:pic')
    if not pictures:
        return None

    # Only the first picture of the paragraph is considered.
    picture: CT_Picture = pictures[0]
    embedIds = picture.xpath('.//a:blip/@r:embed')
    if not embedIds:
        # No embedded relationship (presumably a linked picture): nothing to
        # extract, so report "no image" instead of failing with IndexError.
        return None

    related_part: ImagePart = document.part.related_parts[embedIds[0]]
    return related_part.image


# P1:文档地址 P2:保存图片的名称
def getPictures(path, fileNamePrefix):
    """Save the pictures of a ``.docx`` file and report where they occurred.

    Each paragraph is inspected with ``getPicture``. A found image is written to
    ``wordImagesSavePath() + fileNamePrefix + <paragraph number> + '.' + <ext>``.

    Args:
        path: Location of the ``.docx`` file.
        fileNamePrefix: Prefix used for every saved image file name.

    Returns:
        A list of dicts ``{'line': <1-based paragraph number>, 'picture': <ext>}``,
        one per paragraph that contained an image.
    """
    doc = docx.Document(path)
    imageIndex = []

    for lineNo, para in enumerate(doc.paragraphs, start=1):
        image = getPicture(doc, para)
        if image is None:
            continue

        # Read the image data first: the extension is needed by the index entry
        # below (it was previously used before being assigned).
        blob = image.blob
        ext = image.ext

        imageIndex.append({
            'line': lineNo,
            'picture': str(ext)
        })

        # Decode the raw bytes and store the picture on disk.
        im = Image.open(BytesIO(blob))
        im.save(wordImagesSavePath() + fileNamePrefix + str(lineNo) + '.' + str(ext))

    return imageIndex

解析 Docx 表格

from docx import Document


def getTables(path, allTables=False):
    """Read table text from a ``.docx`` file as rows of cell strings.

    Args:
        path: Location of the ``.docx`` file.
        allTables: When ``False`` (default, the historical behaviour) only the
            first table is returned. When ``True`` every table is returned.

    Returns:
        With ``allTables=False``: the first table as a list of rows, each row a
        list of cell texts, or ``None`` when the document has no table.
        With ``allTables=True``: a list with one such row list per table
        (empty when the document has no table).
    """
    doc = Document(path)
    grids = []

    for table in doc.tables:
        # Flat list of the text of every cell in the table.
        contents = [cell.text for cell in table._cells]
        # A zero step would make range() raise; such a table yields no rows.
        step = max(len(table.columns), 1)

        # Group the flat cell list into rows of `step` columns.
        rows = [contents[i:i + step] for i in range(0, len(contents), step)]

        if not allTables:
            return rows
        grids.append(rows)

    return grids if allTables else None

创建 Docx 文档

import io
import sys
from docx import Document
from docx.shared import RGBColor
import json


def hexToRgb(value):
    """Convert a hexadecimal colour string to a tuple of integer channels.

    Args:
        value: Colour such as ``'#ff8000'``, ``'ff8000'`` or the 3-digit
            shorthand ``'#f80'``. Leading ``#`` characters are ignored.

    Returns:
        A tuple of ints, ``(red, green, blue)`` for 3- and 6-digit input.
        Other lengths are split into chunks of ``len // 3`` digits.

    Raises:
        ValueError: If fewer than three hex digits are given or a chunk is not
            valid hexadecimal.
    """
    digits = value.lstrip('#')
    if len(digits) == 3:
        # Shorthand notation: each digit stands for a doubled pair (f -> ff).
        digits = ''.join(ch * 2 for ch in digits)

    width = len(digits) // 3
    if width == 0:
        raise ValueError('invalid hex colour: {!r}'.format(value))

    return tuple(int(digits[i:i + width], 16) for i in range(0, len(digits), width))


def jsonHandler(jsonStr):
    """Decode a JSON document string and return the resulting Python object."""
    return json.loads(jsonStr)


def createWord(entities, path):
    """Create a ``.docx`` file from a JSON description of paragraphs.

    ``entities`` is a JSON array; every element produces one paragraph and may
    contain:

    * ``aligns`` - value converted with ``int()`` and assigned as the
      paragraph alignment;
    * ``items``  - list of run descriptions with the optional keys
      ``content`` (text), ``color`` (hex string), ``bold`` (``1`` enables
      bold), ``name`` (font name) and ``fontSize``.

    Keys that are missing or ``None`` are skipped.

    Args:
        entities: JSON text describing the paragraphs.
        path: Destination passed to ``doc.save``.
    """
    doc = Document()

    for wordEntity in jsonHandler(entities):
        # Work on the paragraph object returned by add_paragraph instead of
        # re-reading doc.paragraphs by index for every entry.
        para = doc.add_paragraph('')

        aligns = wordEntity.get('aligns')
        if aligns is not None:
            para.paragraph_format.alignment = int(aligns)

        # A paragraph without 'items' simply stays empty.
        for item in wordEntity.get('items') or ():
            run = para.add_run()

            # Text
            content = item.get('content')
            if content is not None:
                run.text = content

            # Colour
            color = item.get('color')
            if color is not None:
                rgb = hexToRgb(color)
                run.font.color.rgb = RGBColor(rgb[0], rgb[1], rgb[2])

            # Bold
            if item.get('bold') == 1:
                run.bold = True

            # Font name
            name = item.get('name')
            if name is not None:
                run.font.name = name

            # Font size: the letters 'p' and 'x' are removed (e.g. "12px" -> "12").
            fontSize = item.get('fontSize')
            if fontSize is not None:
                footsie = str(fontSize).replace('p', '').replace('x', '')
                # NOTE(review): the size is assigned as a raw integer scaled by
                # 10000; confirm this factor matches the intended unit.
                run.font.size = int(footsie) * 10000

    doc.save(path)

创建 xlsx 文档

import json
import xlwt


# 为样式创建背景
def get_background(col):
    """Build the ``xlwt.Pattern`` (cell background) for a cell description.

    Keys read from ``col``:

    * ``backgroundStyle`` - ``None`` or ``'0'`` selects no pattern, ``'1'``
      selects a solid pattern; any other value leaves the pattern untouched.
    * ``backgroundColor`` - hex colour, used only with style ``'1'``.

    Returns:
        The configured ``xlwt.Pattern`` instance.
    """
    bg = xlwt.Pattern()

    if 'backgroundStyle' not in col:
        return bg

    fillStyle = col['backgroundStyle']

    # May be: NO_PATTERN, SOLID_PATTERN, or 0x00 through 0x12
    if fillStyle is None or fillStyle == '0':
        # No background colour
        bg.pattern = xlwt.Pattern.NO_PATTERN
    elif fillStyle == '1':
        # Solid background colour
        bg.pattern = xlwt.Pattern.SOLID_PATTERN
        # Colour index may be: 8 through 63.
        # 0 = Black, 1 = White, 2 = Red, 3 = Green, 4 = Blue, 5 = Yellow,
        # 6 = Magenta, 7 = Cyan, 16 = Maroon, 17 = Dark Green, 18 = Dark Blue,
        # 19 = Dark Yellow (almost brown), 20 = Dark Magenta, 21 = Teal,
        # 22 = Light Gray, 23 = Dark Gray
        if 'backgroundColor' in col and col['backgroundColor'] is not None:
            bg.pattern_fore_colour = convertColor(col['backgroundColor'])

    return bg


# 为样式创建字体
def get_font(col):
    """Build the ``xlwt.Font`` for a cell description.

    Keys read from ``col`` (missing or ``None`` values are skipped):

    * ``fontName``  - font name;
    * ``fontBold``  - ``'1'`` bold, ``'0'`` not bold;
    * ``fontColor`` - hex colour converted with ``convertColor``;
    * ``fontSize``  - size, converted with ``int()``;
    * ``underline`` - ``'1'`` underlined, ``'0'`` not underlined;
    * ``italic``    - ``'1'`` italic, ``'0'`` not italic.

    Returns:
        The configured ``xlwt.Font`` instance.
    """
    font = xlwt.Font()

    def flag(key):
        """Translate the '0'/'1' string under *key* to False/True, else None."""
        value = col.get(key)
        if value == '1':
            return True
        if value == '0':
            return False
        return None

    if col.get('fontName') is not None:
        font.name = col['fontName']

    bold = flag('fontBold')
    if bold is not None:
        font.bold = bold

    if col.get('fontColor') is not None:
        font.colour_index = convertColor(col['fontColor'])

    if col.get('fontSize') is not None:
        # The height is the font size multiplied by 20.
        font.height = 20 * int(col['fontSize'])

    # Underline: '0' now switches it off (it used to switch it on as well).
    underline = flag('underline')
    if underline is not None:
        font.underline = underline

    # Italic: '0' now switches it off (it used to switch it on as well).
    italic = flag('italic')
    if italic is not None:
        font.italic = italic

    return font


# 设置单元格对齐方式
def get_alignment(col):
    """Build the ``xlwt.Alignment`` for a cell description.

    Keys read from ``col`` (missing, ``None`` or unrecognised values are
    ignored):

    * ``horz`` - ``'0'`` left, ``'1'`` horizontally centred, ``'2'`` right;
    * ``vert`` - ``'0'`` top, ``'1'`` vertically centred, ``'2'`` bottom;
    * ``wrap`` - ``'0'`` no wrapping, ``'1'`` automatic line wrap.

    Returns:
        The configured ``xlwt.Alignment`` instance.
    """
    alignment = xlwt.Alignment()

    rules = (
        # 0x01 (left), 0x02 (horizontally centred), 0x03 (right)
        ('horz', (('0', 0x01), ('1', 0x02), ('2', 0x03))),
        # 0x00 (top), 0x01 (vertically centred), 0x02 (bottom)
        ('vert', (('0', 0x00), ('1', 0x01), ('2', 0x02))),
        # automatic line wrap off / on
        ('wrap', (('0', 0), ('1', 1))),
    )

    for key, choices in rules:
        if key not in col or col[key] is None:
            continue
        for token, code in choices:
            if col[key] == token:
                setattr(alignment, key, code)

    return alignment


# 设置边框
def get_borders(col):
    """Build the ``xlwt.Borders`` for a cell description.

    Keys read from ``col`` (missing or ``None`` values are skipped):

    * ``left`` / ``right`` / ``top`` / ``bottom`` - line style, converted
      with ``int()``;
    * ``leftColor`` / ``rightColor`` / ``topColor`` / ``bottomColor`` - hex
      colour converted with ``convertColor``.

    Returns:
        The configured ``xlwt.Borders`` instance.
    """
    borders = xlwt.Borders()

    # Line styles: thin 1, medium 2, dashed 3, dotted 4, thick 5, double 6,
    # hair 7, medium dashed 8, thin dash-dot 9, medium dash-dot 10,
    # thin dash-dot-dot 11, medium dash-dot-dot 12, slanted dash-dot 13
    for side in ('left', 'right', 'top', 'bottom'):
        if side in col and col[side] is not None:
            setattr(borders, side, int(col[side]))

    colourKeys = (
        ('leftColor', 'left_colour'),
        ('rightColor', 'right_colour'),
        ('topColor', 'top_colour'),
        ('bottomColor', 'bottom_colour'),
    )
    for key, attribute in colourKeys:
        if key in col and col[key] is not None:
            setattr(borders, attribute, convertColor(col[key]))

    return borders


# 设置颜色类型转换 # 16 进制 -> 0x 16 进制 -> Final Index
def convertColor(color):
    """Translate a hex colour string into the colour index used for xlwt styles.

    The string is split into channels by ``hexToRgb``; the first three channels
    are then classified by ``colorHandler``.
    """
    channels = hexToRgb(color)
    red, green, blue = channels[0], channels[1], channels[2]
    return colorHandler(red, green, blue)


# 颜色转换 # 16 进制 -> RGB
def hexToRgb(value):
    """Convert a hexadecimal colour string to a tuple of integer channels.

    Args:
        value: Colour such as ``'#ff8000'``, ``'ff8000'`` or the 3-digit
            shorthand ``'#f80'``. Leading ``#`` characters are ignored.

    Returns:
        A tuple of ints, ``(red, green, blue)`` for 3- and 6-digit input.
        Other lengths are split into chunks of ``len // 3`` digits.

    Raises:
        ValueError: If fewer than three hex digits are given or a chunk is not
            valid hexadecimal.
    """
    digits = value.lstrip('#')
    if len(digits) == 3:
        # Shorthand notation: each digit stands for a doubled pair (f -> ff).
        digits = ''.join(ch * 2 for ch in digits)

    width = len(digits) // 3
    if width == 0:
        raise ValueError('invalid hex colour: {!r}'.format(value))

    return tuple(int(digits[i:i + width], 16) for i in range(0, len(digits), width))


# 颜色转换 RGB -> Final Index
def colorHandler(R, G, B):
    """Classify an RGB colour into one of a few fixed colour indexes.

    The checks run in this order; the first match wins:

    1. ``R == G == B``: 0 for black, 1 for white, 23 for any other grey
       within 0..255.
    2. Colour bands, each combining a channel ordering with per-channel
       ranges ("strong" = 175..255, "weak" = 0..80, "medium" = 85..165):
       red -> 2, orange -> 6, yellow -> 5, green -> 3, cyan -> 7,
       blue -> 4, purple -> 16.
    3. Anything else -> 0.

    The range checks were previously written with bitwise ``|`` / ``&``
    (``(x >= lo) | (x <= hi)``), which is always true, so only the channel
    ordering was tested and the yellow and cyan bands could never be reached.
    They are now real range checks; colours outside every band fall back to 0.

    Args:
        R, G, B: Integer channel values.

    Returns:
        The colour index as an int.
    """
    def within(value, bounds):
        """True when ``bounds[0] <= value <= bounds[1]``."""
        return bounds[0] <= value <= bounds[1]

    strong = (255 - 80, 255)
    medium = (125 - 40, 125 + 40)
    weak = (0, 80)

    # Black, white and grey
    if R == G == B:
        if R == 0:
            return 0
        if R == 255:
            return 1
        if within(R, (0, 255)):
            return 23

    bands = (
        # (channel ordering, R range, G range, B range, colour index)
        (R >= G == B, strong, weak, weak, 2),      # red
        # NOTE(review): orange maps to 6 and purple to 16 as in the original
        # code; confirm these indexes are the intended palette entries.
        (R >= G >= B, strong, medium, weak, 6),    # orange
        (R == G >= B, strong, strong, weak, 5),    # yellow
        (R <= G >= B, weak, strong, weak, 3),      # green
        (R <= G == B, weak, strong, strong, 7),    # cyan
        (R <= G <= B, weak, weak, strong, 4),      # blue
        (R >= G <= B, strong, weak, strong, 16),   # purple
    )
    for ordered, rRange, gRange, bRange, index in bands:
        if ordered and within(R, rRange) and within(G, gRange) and within(B, bRange):
            return index

    return 0


def createExcel(text, path):
    """Create a spreadsheet file from a JSON description and save it.

    ``text`` is a JSON array of sheets. Each sheet provides ``sheetName`` and
    ``pyExcelRowEntityList`` (rows); each row provides ``pyExcelColEntities``
    (cells); each cell provides ``text`` plus the style keys consumed by
    ``get_font``, ``get_background``, ``get_alignment`` and ``get_borders``.
    Sheets without a name, and row or cell lists that are ``None``, are skipped.

    Args:
        text: JSON text describing the workbook.
        path: Destination passed to ``Workbook.save``.
    """
    workbook = xlwt.Workbook()

    for sheet in json.loads(text):
        # A sheet without a name is ignored entirely.
        if sheet['sheetName'] is None:
            continue
        worksheet = workbook.add_sheet(sheet['sheetName'])

        rowEntities = sheet['pyExcelRowEntityList']
        if rowEntities is None:
            continue

        for rowNo, rowEntity in enumerate(rowEntities):
            cellEntities = rowEntity['pyExcelColEntities']
            if cellEntities is None:
                continue

            for colNo, cell in enumerate(cellEntities):
                # Assemble the style of this cell.
                cellStyle = xlwt.XFStyle()
                cellStyle.font = get_font(cell)
                cellStyle.pattern = get_background(cell)
                cellStyle.alignment = get_alignment(cell)
                cellStyle.borders = get_borders(cell)
                # Write the cell into the sheet.
                worksheet.write(rowNo, colNo, cell['text'], cellStyle)

    # Persist the workbook.
    workbook.save(path)

关于系统参数的处理

import io
import sys

# 设置文本编码格式
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Command-line arguments: output path first, then the JSON-like payload.
path, jsonStr = sys.argv[1], sys.argv[2]

# Re-insert double quotes around keys and values of the payload. The
# substitutions run strictly in this order; the later ones undo quotes that
# the earlier ones placed next to brackets and between objects.
for _old, _new in (
    (',', '","'),
    (':', '":"'),
    ('{', '{"'),
    ('}', '"}'),
    ('"[', '['),
    (']"', ']'),
    ('}","{', '},{'),
):
    jsonStr = jsonStr.replace(_old, _new)

createExcel(jsonStr, path)
  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值