python识别颜色1007python识别颜色_python读取word文档识别字段颜色，解析字段

最新推荐文章于 2023-05-25 01:15:04 发布

xuliagn

最新推荐文章于 2023-05-25 01:15:04 发布

阅读量227

点赞数

文章标签： python识别颜色1007python识别颜色

本文链接：https://blog.csdn.net/weixin_42544838/article/details/114463093

版权

python版本3.7.3，读取的文档格式为.docx

文中带有简单注释

如果看不懂，可以从百度网盘下载源码后直接查看，修改并运行里面的py文件

提取码：nngw

import os

import sys

import xlrd

import codecs

import collections

import json

import io

import docx

import string

from docx import Document

from docx.shared import RGBColor #这个是docx的颜色类

# Longest question-line length seen so far; readDocx raises it when a longer line is parsed.
maxLength = 0

# Number assigned as "_id" to the next completed question record.
# NOTE: this name shadows the builtin id(); readDocx and writeDocx both rebind it via `global`.
id = 1

# Question records collected by readDocx and serialized by writeDocx.
convert_list = []

# Reset by writeDocx; the only code that would fill it is commented out in readDocx.
type_list = []

# Directory containing this script; input and output paths are built relative to it.
curPath = os.path.dirname(os.path.abspath(__file__))

# coding=utf-8

#获取文档对象

def _extract_question(text):
    """Strip the numbering prefix from a question line.

    Returns a ``(question, measured_length)`` tuple, where ``measured_length``
    is the length of the line after an optional leading "Número" label has
    been removed (this is the value compared against ``maxLength``).
    Anything following the first "|" in the question is discarded.
    """
    # Drop a "Número" label found at offset 0 or 1, plus the one character after it.
    pos = text.find("Número")
    if pos != -1 and pos < 2:
        text = text[pos + len("Número") + 1:]

    # The numbering is separated from the question by "-", else ".", else " ".
    # When none of them is present the whole line is kept as the question
    # (the previous code raised ValueError from str.index in that case).
    # NOTE(review): the first occurrence anywhere in the line is used, so a
    # "-" inside the question body wins over an earlier "." — confirm the
    # input files never contain that combination.
    start = 0
    for separator in ("-", ".", " "):
        pos = text.find(separator)
        if pos != -1:
            start = pos + len(separator)
            break

    return text[start:].split("|")[0], len(text)


def _strip_answer_markers(text):
    """Cut an option line at each "(Direito)"/"(Correcta)"/"(Right)"/"(Correct)" marker."""
    for marker in ("(Direito)", "(Correcta)", "(Right)", "(Correct)"):
        pos = text.find(marker)
        if pos != -1:
            text = text[:pos]
    return text


def readDocx(fileName,type):
    """Parse one .docx question file and append its records to ``convert_list``.

    The first paragraph and every blank paragraph are skipped. The remaining
    paragraphs are consumed in groups of five: one question line followed by
    four option lines. Each completed group becomes a dict with the keys
    ``question``, ``subType``, ``option1``..``option4``, ``_id`` and, when an
    option contains a run coloured ``00FF00``, ``rightIndex``. A trailing
    group with fewer than five paragraphs is discarded.

    Args:
        fileName: Path of the document relative to ``curPath``, without the
            ".docx" extension. Backslashes are accepted as separators on
            every platform.
        type: Value stored under ``subType`` in every record of this file.

    Side effects:
        Prints the resolved path, appends to the module-level
        ``convert_list``, and updates the globals ``id`` and ``maxLength``.
    """
    global id
    global maxLength

    # Callers pass Windows-style "dir\\name" paths; normalise the separator
    # so the same arguments also resolve on other operating systems.
    xlsFile = os.path.join(curPath, (fileName + '.docx').replace('\\', os.sep))
    print("xlsFile: "+xlsFile)
    document = docx.Document(xlsFile)

    index = 0  # 0 = question line, 1..4 = option lines of the current group
    data = {}
    for lineNo, p in enumerate(document.paragraphs):
        if lineNo == 0:  # skip the first paragraph
            continue

        # Work on a copy of the text: assigning to p.text would rewrite the
        # paragraph's runs in the loaded document for no benefit.
        text = p.text
        if not text.strip():
            continue

        if index == 0:
            question, length = _extract_question(text)
            if length > maxLength:
                maxLength = length
            data["question"] = question
            data["subType"] = type
        else:
            # An option with any run coloured pure green is the correct answer.
            if any(str(run.font.color.rgb) == "00FF00" for run in p.runs):
                data["rightIndex"] = index
            data["option"+str(index)] = _strip_answer_markers(text)

        index = index + 1
        if index >= 5:
            # Question plus four options collected: emit the record.
            data["_id"] = id
            convert_list.append(data)
            index = 0
            id = id + 1
            data = {}

def writeDocx(fileList,name):
    """Convert a set of .docx question files into one JSON text file.

    Resets the module-level ``id``, ``convert_list`` and ``type_list``, calls
    ``readDocx`` for every entry of ``fileList``, then writes the collected
    records to ``<curPath>/topic/<name>.txt`` as UTF-8 JSON (4-space indent,
    sorted keys, non-ASCII characters kept as-is).

    Args:
        fileList: Iterable of dicts, each with a ``"path"`` and a ``"type"``
            key; both values are passed straight to ``readDocx``.
        name: Base name of the output file, without extension.
    """
    global id
    global convert_list
    global type_list

    # Start every output file from a clean state so numbering restarts at 1.
    id = 1
    convert_list = []
    type_list = []

    for p in fileList:
        readDocx(p["path"],p["type"])

    jsonPath = os.path.join(curPath,"topic",name+".txt")
    # exist_ok avoids the check-then-create race of testing os.path.exists first.
    os.makedirs(os.path.dirname(jsonPath), exist_ok=True)

    with io.open(jsonPath, 'w', encoding='utf-8') as f:
        json.dump(convert_list, f, ensure_ascii=False, indent=4, sort_keys=True)

def main():
    """Convert the Portuguese, Spanish and English question sets, in that order.

    For each language folder a list of ``{"path", "type"}`` dicts is built and
    handed to ``writeDocx`` together with the folder name, which is also used
    as the output file name.
    """
    # Category assigned to the files of every folder, in file order.
    topicTypes = ("World", "Technology", "History",
                  "ArtAndLiterature", "Fashion", "Sports")

    # Folder name -> document base names, listed in the same order as topicTypes.
    stemsByFolder = (
        ("pt_br_topic", ("地理(葡)Respueda G .es.pt",
                         "科学与技术(葡)",
                         "历史(葡)Resupeda H.es.pt",
                         "艺术和文学(葡)Respueda A&L.es.pt",
                         "娱乐(葡)Respueda E.es.pt",
                         "运动(葡)Respueda D.es.pt")),
        ("es_es_topic", ("地理(西)Respueda G ",
                         "科学与技术(西)Respueda C&T",
                         "历史(西)Resupeda H",
                         "艺术和文学(西)Respueda A&L",
                         "娱乐(西)Respueda E",
                         "运动(西)Respueda D")),
        ("en_us_topic", ("地理(英)Respueda G .es.en",
                         "科学与技术(英)",
                         "历史(英)Resupeda H.es.en",
                         "艺术和文学(英)Respueda A&L.es.en",
                         "娱乐(英)Respueda E.es.en",
                         "运动(英)Respueda D.es.en")),
    )

    for folder, stems in stemsByFolder:
        fileList = [{"path": folder + "\\" + stem, "type": kind}
                    for stem, kind in zip(stems, topicTypes)]
        writeDocx(fileList, folder)

# Script entry point: there is no __main__ guard, so the conversion runs
# whenever this module is executed or imported.
main()

xuliagn

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
python识别颜色1007python识别颜色_python读取word文档识别字段颜色，解析字段

python版本3.7.3，读取的文档格式为.docx。文中带有简单注释。如果看不懂，可以从百度网盘下载源码后直接查看，修改并运行里面的py文件。提取码：nngw。import os; import sys; import xlrd; import codecs; import collections; import json; import io; import docx; import string; from docx import Document; f...
复制链接

扫一扫