最近做了一个“浙江省高中信息技术高考题库系统”,后台用django框架,前端用vue.js。其中有个功能模块是把docx格式的练习题目批量导入数据库。具体做法是:前端上传word版的题目,后台逐段读取里面的题目内容,包括图片。
网络上搜索到的方法,大部分是一次性读取docx中的所有图片。这在我的系统里行不通,因为我需要知道某一段里的图片属于哪一题,图片要跟相应的题目一起写入数据库。
先上图吧
最终的目标就是把题目的内容、答案、解析、图片、题型等提取出来,其中图片以base64格式编码成字符串,统一成json格式,把多个题目的信息以列表形式返回给前端,由前端确认题目是否识别正确;形式如下:
[{"content":"按照二叉树的定义,具有3个节点的二叉树形态有( )A.3种B.4种C.5种D.6种","answer":"C","explain":"略","type":"选择题","pictures":"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEAwADAAAD/2wBDAAIBAQIBAQICAgICAgICAwUDAwMDAwYEBAMFBwYHBwcGBwcICQsJCAgKCAcHCg0KCgsMDAwMBwkODw0MDgsMDAz/2wBDAQICAgMDAwYDAwYMCAcIDAwMDAwMDAwMDAwMDAwMDAw"}]
因为题库系统中题目的模型如下:
class Questions(models.Model):
    """A single exam question stored in the question bank.

    Holds the question text, its answer and explanation, classification
    (type, difficulty, knowledge point), bookkeeping (owner, timestamps,
    ordering) and an optional picture payload kept as text.
    """

    QUESTION_TYPE_CHOICES = (
        ('选择题', '选择题'),
        ('填空题', '填空题'),
        ('简答题', '简答题'),
    )
    DIFFICULTY_LEVEL_CHOICES = (
        ('入门', '入门'),
        ('中级', '中级'),
        ('高级', '高级'),
    )

    content = models.CharField(max_length=1000, verbose_name='内容')
    # A separate `overview` field (summary made of the first 25 characters
    # of the content) was drafted here and is currently disabled.
    answer = models.CharField(max_length=200, verbose_name='正确答案')
    explain = models.CharField(max_length=200, verbose_name='解析')
    reference = models.CharField(
        max_length=20,
        verbose_name='引用哪里',
        help_text="比如引用5年高考3年模拟,写作53",
    )
    type = models.CharField(
        verbose_name='题型', max_length=10, choices=QUESTION_TYPE_CHOICES
    )
    difficulty_level = models.CharField(
        verbose_name='难度',
        max_length=10,
        choices=DIFFICULTY_LEVEL_CHOICES,
        default=DIFFICULTY_LEVEL_CHOICES[0][0],
    )
    owner = models.ForeignKey(
        User,
        verbose_name='录入者',
        on_delete=models.CASCADE,
        related_name="questions",
    )
    add_time = models.DateTimeField(auto_now_add=True, verbose_name='录入时间')
    # auto_now (not auto_now_add) so the timestamp is refreshed on every
    # save(); with auto_now_add it was frozen at creation time and never
    # reflected later edits.
    last_edit_time = models.DateTimeField(auto_now=True, verbose_name='修改时间')
    # NOTE(review): OPEN_LEVEL_CHOICES is not defined in this class; it is
    # presumably a module-level constant - confirm it is in scope.
    open_level = models.CharField(
        verbose_name='开放程度',
        choices=OPEN_LEVEL_CHOICES,
        max_length=10,
        default=OPEN_LEVEL_CHOICES[0][0],
    )
    #tags = TaggableManager()
    top = models.BooleanField(default=False, verbose_name='置顶')
    knowledgepoint = models.ForeignKey(
        KnowledgePoint,
        on_delete=models.CASCADE,
        verbose_name='知识点',
        related_name="questions",
    )
    order = OrderField(
        blank=True,
        for_fields=['knowledgepoint'],
        verbose_name='排序序号',
        help_text="决定各节之间的排序序号,升序排序比如第一章为1,第二章为2;1.1为1;1.3为了3",
    )
    # Picture payload stored as text (the importer sends base64 data URIs).
    # NOTE(review): a large image can exceed 200000 characters and some
    # database backends cap VARCHAR length - consider a TextField.
    pictures = models.CharField(max_length=200000, blank=True)

    class Meta:
        verbose_name = '题目'
        verbose_name_plural = verbose_name
        ordering = ['order']

    def __str__(self):
        """Return a short preview of the question text.

        The previous implementation indexed a single character
        (``content[len(content) // 10]``), which yields one arbitrary
        character and raises IndexError for empty content. A prefix slice
        is safe for any length, including the empty string.
        """
        return self.content[:25]
前端确认无误或者修改后,把所有题目信息的列表重新post给后端,写入数据库
言归正传:其中如何逐段读取word内的图片
多说无益,先附上代码:
import docx
from docx.document import Document
from docx.text.paragraph import Paragraph
from docx.image.image import Image
from docx.parts.image import ImagePart
from docx.oxml.shape import CT_Picture
from PIL import Image
from io import BytesIO
import sys
def get_picture(document: "Document", paragraph: "Paragraph"):
    """Return the images embedded in a single paragraph, in XML order.

    Args:
        document: The opened document. The relationships of
            ``document.part`` are used to resolve each picture's
            relationship id to its image part.
        paragraph: The paragraph whose inline pictures are wanted.

    Returns:
        A list with one image object (``related_part.image``) per embedded
        picture found in the paragraph. The list is empty when the
        paragraph holds no embedded picture, so callers can always iterate
        the result (the earlier version returned ``None`` in that case).

    Raises:
        KeyError: If a picture references a relationship id that is not
            present in ``document.part.related_parts``.
    """
    images = []
    for picture in paragraph._element.xpath('.//pic:pic'):
        embed_ids = picture.xpath('.//a:blip/@r:embed')
        if not embed_ids:
            # No r:embed id on this picture (presumably a linked rather
            # than embedded image): there is nothing to resolve, so skip
            # it instead of failing with IndexError on [0].
            continue
        related_part = document.part.related_parts[embed_ids[0]]
        images.append(related_part.image)
    return images
if __name__ == "__main__":
    # Demo: open test.docx, walk its paragraphs in order and display every
    # embedded image as it is encountered.

    # The file imports two different classes named `Image` (python-docx's
    # and PIL's); bind PIL's under an unambiguous name so opening the image
    # does not depend on which import happened to come last.
    from PIL import Image as PILImage

    d = docx.Document('test.docx')
    for paragraph in d.paragraphs:
        # `or []` keeps the loop safe even if get_picture() returns None
        # for a paragraph without pictures.
        for image in get_picture(d, paragraph) or []:
            if not image:
                continue
            # image.blob is the raw binary content of the picture
            # (image.ext holds the file extension if it needs saving).
            PILImage.open(BytesIO(image.blob)).show()
效果如下:依次逐段读取每张图片,并显示出来。这里只是测试,跟项目结合读取题目内容的部分,有空下次再发出来。
解说下源代码的关键地方:
import docx
from docx.document import Document #读取word的docx文件
from docx.text.paragraph import Paragraph #读取段落
from docx.image.image import Image #读取图片
for i in range(len(d.paragraphs)):
paragraph = d.paragraphs[i] #逐段读取,取第i个自然段
img_list = paragraph._element.xpath('.//pic:pic') #读取该自然段内的所有图片,返回列表
for i in range(len(img_list)):
img: CT_Picture = img_list[i]
embed = img.xpath('.//a:blip/@r:embed')[0] #读取该图片的关系id(rId),之后用它在文档的关联部件中找到对应的内嵌图片
if image:
# 后缀
ext = image.ext
# 二进制内容
blob = image.blob #blob属性返回图片的二进制内容(bytes)
# 显示图片
Image.open(BytesIO(blob)).show() #BytesIO 转成字节流显示出来
本文章引用以下: