python 从docx文件中读取文字和图片，其中图片编码成base64格式（高中信息技术题库系统）

本文链接：https://blog.csdn.net/zhuwoqing/article/details/124905091

本文介绍如何使用Python（python-docx）读取.docx文档：逐段遍历文档，通过条件判断把每个自然段归入结构化的字典（题目、答案、解析、题型），并把段落中的图片编码成base64。文中以一道关于while循环的质因数分解填空题作为输出示例；get_content函数负责解析单个段落，提取题目、答案和解析，适用于教育题库录入或编程教程解析。

摘要由CSDN通过智能技术生成

网络上介绍读取docx文本的文章很多，但要把每一个自然段逐一地存入字典相应的key:value中却非常繁琐，需要把逻辑理得很清楚。

下面再次给出我的需求和方案。

最终读取后形成如下格式：

[{
"content": "输入一个正整数，输出所有的质因子。如24＝2*2*2*3。实现上述功能的Python代码如下：\n\n\n\nn＝int(input(″输入一个正整数：″))\n\n\n\ni＝2\n\n\n\nwhile______①______：\n\n\n\n　if n % i＝＝0：\n\n\n\nn＝n/i\n\n\n\nprint(i)\n\n\n\n　else：\n\n\n\n______②______\n\n\n\n(1)在程序划线处填入合适的代码。\n\n\n\n(2)按照上述算法，输入60，依次输出的质因子是____________。",
"answer": "(1)①n>1或n! ＝1　②i＋＝1　(2)2 2 3 5",
"explain": "最小的质因数是2，如果能被2整除，则反复相除，当不能被2整除时，将i增加1，尝试被3整除，如果还不能除通，往上增加到4，由于前面反复除2操作，因此不可能被不是质数的数除通。当相除的结果为1时，终止循环。输入60，可以被2除2次，被3除1次，被5除1次。",
"reference": 53,
"type": "填空题",
"difficulty_level": "中级",
"knowledgepoint": 11,
"open_level": "public",
"tags": "while循环",
"top": false
}]

from email import contentmanager
import imp
import docx
from docx.document import Document
from docx.text.paragraph import Paragraph
from docx.image.image import Image
from docx.parts.image import ImagePart
from docx.oxml.shape import CT_Picture
from PIL import Image
from io import BytesIO
import sys
import base64
import struct

# Prefixes "1." .. "30." that mark the first line of a numbered question.
question_number_list = [f"{number}." for number in range(1, 31)]
# Question-type selector: maps the heading fragment found after the section
# numeral (e.g. "、填空题" in "二、填空题") to the bare type name ("填空题").
text_head_filling = "、填空题"
text_head_choice = "、选择题"
text_head_SQAS = "、简答题"
type_dict = {
    head: head[1:]
    for head in (text_head_filling, text_head_choice, text_head_SQAS)
}
def get_picture(document: Document, paragraph: Paragraph):
    """Collect the images embedded in one paragraph.

    Args:
        document: The document that owns ``paragraph``; the relationship
            table of its part is used to resolve picture relationship ids.
        paragraph: The paragraph whose XML is searched for ``pic:pic``
            elements.

    Returns:
        ``None`` when the paragraph contains no picture element at all.
        Otherwise a list holding the ``image`` object of the related part
        of every embedded picture, in document order. The list can be
        empty when every picture in the paragraph is linked rather than
        embedded.
    """
    pictures = paragraph._element.xpath('.//pic:pic')
    if not pictures:
        return None

    result_list = []
    for picture in pictures:
        embed_ids = picture.xpath('.//a:blip/@r:embed')
        if not embed_ids:
            # A picture without an r:embed attribute (e.g. one that is only
            # linked) has no image part to read; skip it instead of failing
            # with IndexError on the empty xpath result.
            continue
        related_part: ImagePart = document.part.related_parts[embed_ids[0]]
        result_list.append(related_part.image)
    return result_list


def get_content(paragraph: Paragraph):
    """Classify one paragraph of an exam document and extract its text.

    Args:
        paragraph: The paragraph to inspect; only its ``text`` is read.

    Returns:
        A 4-tuple ``(new_question, new_type, new_key, content)``:

        * ``new_question`` -- ``True`` when the paragraph starts with a
          question number listed in ``question_number_list`` ("1." .. "30.").
        * ``new_type`` -- the question type from ``type_dict`` when the
          paragraph is a section heading such as "一、选择题", else ``""``.
        * ``new_key`` -- ``"content"``, ``"answer"`` or ``"explain"`` when
          the paragraph opens that field, ``""`` when it continues the
          current field.
        * ``content`` -- the paragraph text with the question number or the
          "【答案】" / "【解析】" marker (and everything before it) removed.

        Blank paragraphs and figure captions ("第1题图") yield
        ``(False, "", "", "")``; section headings yield
        ``(False, <type>, "", "")``.
    """
    nothing = (False, "", "", "")
    paragraph_text = paragraph.text
    stripped = paragraph_text.strip()

    # Blank paragraph.
    if not stripped:
        return nothing
    # Figure caption such as "第1题图" or "第12题图".
    if stripped.startswith("第") and "题图" in (stripped[2:4], stripped[3:5]):
        return nothing
    # Section heading: one numeral character followed by "、<type>". The
    # stripped text is used so leading whitespace cannot hide a heading.
    heading = stripped[1:5]
    if heading in type_dict:
        return False, type_dict[heading], "", ""

    new_question = False
    new_key = ""
    content_start = 0

    # All offsets below are computed on the unstripped text, because the
    # returned content is sliced from it (leading indentation of continuation
    # lines, e.g. code inside a question, must be preserved).
    indent = len(paragraph_text) - len(paragraph_text.lstrip())

    # First line of a question: the number must open the paragraph. Looking
    # at the first "." anywhere would mistake text such as "x = 1.5" for the
    # start of question 1.
    number, dot, _ = paragraph_text[indent:].partition(".")
    if dot and number + "." in question_number_list:
        new_question = True
        new_key = "content"
        content_start = indent + len(number) + 1

    answer_marker = "【答案】"
    explain_marker = "【解析】"
    position_answer = paragraph_text.find(answer_marker)
    position_explain = paragraph_text.find(explain_marker)
    if position_answer >= 0:
        # Answer paragraph: keep only what follows the marker.
        new_key = "answer"
        content_start = position_answer + len(answer_marker)
        # Drop one separator (whitespace or colon) right after the marker if
        # there is one, so that "【答案】 C" and "【答案】C" both give "C".
        separator = paragraph_text[content_start:content_start + 1]
        if separator and (separator.isspace() or separator in ":："):
            content_start += 1
    elif position_explain >= 0:
        # Explanation paragraph: keep only what follows the marker.
        new_key = "explain"
        content_start = position_explain + len(explain_marker)

    return new_question, "", new_key, paragraph_text[content_start:]
             

def _picture_to_data_uri(image):
    """Return one extracted image as a base64 ``data:`` URI."""
    # Use the image's own MIME type so that e.g. PNG pictures are not
    # labelled as JPEG; fall back to the historical value if it is missing.
    mime = getattr(image, "content_type", None) or "image/jpeg"
    return "data:" + mime + ";base64," + base64.b64encode(image.blob).decode('utf-8')


def ReadDocx2List(d: Document, *, debug: bool = True):
    """Convert an exam document into a list of question dictionaries.

    Parsing starts at the first section heading recognised through
    ``type_dict`` (for example "一、选择题"); if the document has no such
    heading it starts at the first paragraph. Every paragraph is classified
    with ``get_content`` and its text is appended to the field
    (``content`` / ``answer`` / ``explain``) that is currently open.
    Pictures found by ``get_picture`` are stored on the question the
    paragraph belongs to.

    Args:
        d: The opened document.
        debug: When true (the default, matching the historical behaviour)
            every extracted text and data URI is printed and every picture
            is opened in the system image viewer. Pass ``False`` for
            unattended use.

    Returns:
        A list of dicts with the keys ``content``, ``answer``, ``explain``,
        ``type`` and ``pictures``. ``pictures`` holds the data URIs of the
        question's images joined by ``"-"``. The list is empty when nothing
        at all was collected.
    """
    # Read the property once; the loops below only need a stable snapshot.
    paragraphs = d.paragraphs

    # Row of the first section heading, or 0 when there is none (otherwise
    # only the last paragraph of a heading-less document would be parsed).
    start_row = next(
        (row for row, paragraph in enumerate(paragraphs)
         if paragraph.text.strip()[1:5] in type_dict),
        0,
    )

    current_key = "content"      # field that continuation paragraphs go to
    current_type = "选择题"       # question type currently in effect
    questions_list = []          # finished questions
    question_dict = {"content": "", "answer": "", "explain": "", "type": "", "pictures": ""}
    first = True                 # no numbered question seen yet

    for paragraph in paragraphs[start_row:]:
        # Handle the text first so that a paragraph which both starts a new
        # question and contains a picture attaches the picture to the new
        # question, and so that its text is not lost.
        new_question, temp_type, temp_key, result_text = get_content(paragraph)
        if debug:
            print(result_text)

        if temp_type != "":
            # Section heading: only the type in effect changes.
            current_type = temp_type
            current_key = "content"
        elif temp_key != "" and new_question:
            if first:
                # First question: reuse the initial dict; anything collected
                # in its content before the first number is replaced.
                first = False
                question_dict['type'] = current_type
            else:
                # Close the previous question and open a fresh one.
                questions_list.append(question_dict)
                question_dict = {"content": "", "answer": "", "explain": "", "type": current_type, "pictures": ""}
            current_key = "content"
            question_dict[current_key] = result_text + "\n"
        elif temp_key != "":
            # Answer / explanation marker: switch field and store the text.
            current_key = temp_key
            question_dict[current_key] += result_text + "\n"
        elif result_text != "":
            # Continuation paragraph of the field that is currently open.
            question_dict[current_key] += result_text + "\n"

        # get_picture returns None for paragraphs without pictures.
        for image in get_picture(d, paragraph) or []:
            if not image:
                continue
            if debug:
                # Show the picture in the system viewer.
                Image.open(BytesIO(image.blob)).show()
            bs64 = _picture_to_data_uri(image)
            if question_dict["pictures"] == "":
                question_dict["pictures"] = bs64
            else:
                question_dict["pictures"] += "-" + bs64
            if debug:
                print(bs64)

    # Do not emit a completely blank record for a document without content.
    if not first or any(question_dict.values()):
        questions_list.append(question_dict)
    return questions_list
if __name__ == "__main__":
    # The document path may be given as the first command-line argument;
    # without one the original default file name is used.
    docx_path = sys.argv[1] if len(sys.argv) > 1 else 'test.docx'
    d = docx.Document(docx_path)
    data_list = ReadDocx2List(d)
    print(data_list)

代码中的注释已经比较详细了。有疑问的请留言，互相探讨。