Python 版本为 3.7.3,读取的文档格式为 .docx。
文中带有简单注释
![b850f0c0409628cb8cea648da2ee81c4.png](https://img-blog.csdnimg.cn/img_convert/b850f0c0409628cb8cea648da2ee81c4.png)
如果看不懂,可以从百度网盘下载完整文件直接查看,然后按需修改并运行里面的 py 文件。
- 网盘下载
- 提取码:nngw
![9ddf768aa92e0de87c790e486563fc03.png](https://img-blog.csdnimg.cn/img_convert/9ddf768aa92e0de87c790e486563fc03.png)
# coding=utf-8
"""Convert quiz questions stored in .docx files into JSON question banks.

Each source document is read paragraph by paragraph.  After the first
paragraph (which is skipped) every group of five non-blank paragraphs is
treated as one question followed by four answer options; the option that
contains a run coloured 00FF00 is recorded as the right answer.
"""
import os
import sys
import xlrd
import codecs
import collections
import json
import io
import docx
import string
from docx import Document
from docx.shared import RGBColor  # python-docx colour class

maxLength = 0        # longest question paragraph seen so far (in characters)
id = 1               # running question id; reset by writeDocx() for every bank
convert_list = []    # questions collected for the bank currently being built
type_list = []
curPath = os.path.dirname(os.path.abspath(__file__))

# Prefix that may precede the question number at the start of a paragraph.
_QUESTION_PREFIX = "Número"
# Separators between the question number and the question text, tried in order.
_QUESTION_SEPARATORS = ("-", ".", " ")
# Markers appended to the right answer in the documents; removed from options.
_ANSWER_MARKERS = ("(Direito)", "(Correcta)", "(Right)", "(Correct)")
# Hex colour (as rendered by str() on a run's font colour) of the right answer.
_CORRECT_RGB = "00FF00"
# One question paragraph plus four option paragraphs.
_PARAGRAPHS_PER_QUESTION = 5


def _strip_number_prefix(text):
    """Drop a leading "Número" (found at offset 0 or 1) plus the next character."""
    pos = text.find(_QUESTION_PREFIX)
    if pos != -1 and pos < 2:
        return text[pos + len(_QUESTION_PREFIX) + 1:]
    return text


def _extract_question(text):
    """Return the question text that follows the first separator, before any "|".

    If none of the separators occurs, the whole text is used instead of
    raising ValueError.
    """
    for separator in _QUESTION_SEPARATORS:
        pos = text.find(separator)
        if pos != -1:
            text = text[pos + len(separator):]
            break
    return text.split("|")[0]


def _strip_answer_marker(text):
    """Cut the option text at each right-answer marker it contains."""
    for marker in _ANSWER_MARKERS:
        pos = text.find(marker)
        if pos != -1:
            text = text[:pos]
    return text


def readDocx(fileName, type):
    """Parse one .docx file and append its questions to ``convert_list``.

    Args:
        fileName: path of the document relative to this script's directory,
            without the ".docx" extension.
        type: value stored under the "subType" key of every question read.

    Side effects:
        Appends one dict per complete question to ``convert_list``, advances
        the global ``id`` counter and updates the global ``maxLength``.
    """
    global id
    global maxLength

    # os.path.join inserts the directory separator that plain string
    # concatenation of curPath and fileName left out.
    xlsFile = os.path.join(curPath, fileName + '.docx')
    print("xlsFile: " + xlsFile)
    file = docx.Document(xlsFile)

    index = 0   # position inside the current group: 0 = question, 1-4 = options
    data = {}
    for i, p in enumerate(file.paragraphs, start=1):
        if i <= 1:  # skip the first paragraph of the document
            continue
        # Work on a local copy: assigning to p.text would rewrite the
        # paragraph's runs in the loaded document.
        text = p.text
        if not text.strip():
            continue

        if index == 0:
            # Question paragraph.
            text = _strip_number_prefix(text)
            maxLength = max(maxLength, len(text))
            data["question"] = _extract_question(text)
            data["subType"] = type
        else:
            # Option paragraph: a run coloured 00FF00 marks the right answer.
            if any(str(run.font.color.rgb) == _CORRECT_RGB for run in p.runs):
                data["rightIndex"] = index
            data["option" + str(index)] = _strip_answer_marker(text)

        index += 1
        if index >= _PARAGRAPHS_PER_QUESTION:
            data["_id"] = id
            convert_list.append(data)
            index = 0
            id += 1
            data = {}

    if index:
        # The document ended in the middle of a group; report it instead of
        # dropping the partial question silently.
        print("Warning: incomplete question at end of " + xlsFile
              + " (" + str(index) + " paragraph(s) ignored)")


def writeDocx(fileList, name):
    """Build one question bank from several documents and write it as JSON.

    Args:
        fileList: list of dicts with keys "path" (document path without the
            ".docx" extension, relative to this script) and "type".
        name: base name of the output file, written to "topic/<name>.txt"
            next to this script.
    """
    global id
    global convert_list
    global type_list

    # Every bank starts from a clean state and numbers its questions from 1.
    id = 1
    convert_list = []
    type_list = []

    for entry in fileList:
        readDocx(entry["path"], entry["type"])

    jsonPath = os.path.join(curPath, "topic", name + ".txt")
    os.makedirs(os.path.dirname(jsonPath), exist_ok=True)
    with io.open(jsonPath, 'w', encoding='utf-8') as f:
        f.write(json.dumps(convert_list, ensure_ascii=False, indent=4, sort_keys=True))


def main():
    """Convert the Portuguese, Spanish and English question banks."""
    # NOTE(review): each "path" looks like "<folder><file name>" with no
    # separator between the two parts - confirm against the real directory
    # layout before running.
    en_fileList = [
        {"path": "en_us_topic地理(英)Respueda G .es.en", "type": "World"},
        {"path": "en_us_topic科学与技术(英)", "type": "Technology"},
        {"path": "en_us_topic历史(英)Resupeda H.es.en", "type": "History"},
        {"path": "en_us_topic艺术和文学(英)Respueda A&L.es.en", "type": "ArtAndLiterature"},
        {"path": "en_us_topic娱乐(英)Respueda E.es.en", "type": "Fashion"},
        {"path": "en_us_topic运动(英)Respueda D.es.en", "type": "Sports"},
    ]
    en_name = "en_us_topic"

    es_fileList = [
        {"path": "es_es_topic地理(西)Respueda G ", "type": "World"},
        {"path": "es_es_topic科学与技术(西)Respueda C&T", "type": "Technology"},
        {"path": "es_es_topic历史(西)Resupeda H", "type": "History"},
        {"path": "es_es_topic艺术和文学(西)Respueda A&L", "type": "ArtAndLiterature"},
        {"path": "es_es_topic娱乐(西)Respueda E", "type": "Fashion"},
        {"path": "es_es_topic运动(西)Respueda D", "type": "Sports"},
    ]
    es_name = "es_es_topic"

    pt_fileList = [
        {"path": "pt_br_topic地理(葡)Respueda G .es.pt", "type": "World"},
        {"path": "pt_br_topic科学与技术(葡)", "type": "Technology"},
        {"path": "pt_br_topic历史(葡)Resupeda H.es.pt", "type": "History"},
        {"path": "pt_br_topic艺术和文学(葡)Respueda A&L.es.pt", "type": "ArtAndLiterature"},
        {"path": "pt_br_topic娱乐(葡)Respueda E.es.pt", "type": "Fashion"},
        {"path": "pt_br_topic运动(葡)Respueda D.es.pt", "type": "Sports"},
    ]
    pt_name = "pt_br_topic"

    writeDocx(pt_fileList, pt_name)
    writeDocx(es_fileList, es_name)
    writeDocx(en_fileList, en_name)


if __name__ == "__main__":
    main()
有什么问题欢迎大家评论区留言讨论,都看到这了,别忘了点关注哦!