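# Extract the text of every paragraph together with its image links, keeping
# each link at the position where the image appears in the original document.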
from lxml import etree
# Sample HTML fragment used as input for the extraction below: a "solution"
# <div id="solution-text"> holding two <h2> paragraphs. The first carries the
# answer letter; the second mixes plain text, MathJax-style <span> markup and
# two <img> elements whose src attributes are the image links of interest.
# NOTE: the fragment is truncated -- the second <h2>, its two wrapper <span>s
# and the outer <div> are never closed -- so parsing relies on the HTML parser
# tolerating unclosed tags. The first formula span (mjx-mrow) is empty.
html_doc = """
<div id="solution-text" class="max-w-full w-full p-2 lg:px-4 space-y-2 rounded overflow-x-auto ease-[0.5s] bg-body-bg">
<h2 class="font-bold text-lg text-green-600 mt-4 inline-flex items-center flex-wrap gap-2">
<span>The correct Answer is:</span>
<span class="math inline-block !w-full overflow-x-auto overflow-y-hidden text-lg">
<span>D</span>
</span>
</h2>
<hr class="my-4"/>
<h2 class="text-base font-normal my-2">
<span class="math inline-block !w-full overflow-x-auto overflow-y-hidden ">
<span>
Graph of
<span class="mjx-chtml">
<span class="mjx-math">
<span class="mjx-mrow">
</span>
</span>
</span>
intersect at three points. So, number of solution is 3 <br>
<img style="background:white;" class="mx-auto" width="80%" src="https://d10lpgp6xz60nq.cloudfront.net/physics_images/CEN_TRI_C07_E10_002_S01.png" loading="lazy" alt="ocr_image">
<br>
(b) Graph of
<span class="mjx-chtml">
<span class="mjx-math">
<span class="mjx-mrow">
<span class="mjx-mstyle">
<span class="mjx-mrow">
<span class="mjx-mi">
<span style="padding-top: 0.225em; padding-bottom: 0.519em; padding-right: 0.006em;" class="mjx-char MJXc-TeX-math-I">y</span>
</span>
</span>
</span>
</span>
</span>
</span>
interect at four points. So, number of solution is 4. <br>
<img style="background:white;" class="mx-auto" width="80%" src="https://d10lpgp6xz60nq.cloudfront.net/physics_images/CEN_TRI_C07_E10_002_S02.png" loading="lazy" alt="ocr_image">
<br>
"""
# Extract the readable text of every <h2> paragraph inside the solution <div>,
# with each image link placed on its own line at the position where the <img>
# occurs in the document.


def _flush_text(buffer, lines):
    """Collapse the raw text in *buffer* into one output line and reset it.

    Runs of whitespace (including the newlines of the HTML source) become a
    single space, so fragments that were separated by whitespace in the
    markup stay separated and fragments that were adjacent stay joined.
    Nothing is appended when the buffer holds only whitespace.
    """
    text = " ".join("".join(buffer).split())
    if text:
        lines.append(text)
    buffer.clear()


def _walk(node, buffer, lines):
    """Visit *node* and its descendants in document order.

    Text is gathered in *buffer*; an ``<img>`` ends the current text line and
    adds its own ``Image link: ...`` line to *lines*. The recursion depth
    equals the nesting depth of the markup.
    """
    if node.tag == 'img':
        # Finish the text seen so far so the link is not glued to it, and so
        # that text following the image starts on a fresh line.
        _flush_text(buffer, lines)
        lines.append(f"Image link: {node.get('src')}")
    elif node.tag == 'br':
        # A line break separates words even without surrounding whitespace.
        buffer.append(" ")
    elif isinstance(node.tag, str) and node.text:
        # Comments and processing instructions have a non-string tag; their
        # content is not document text, so only real elements contribute.
        buffer.append(node.text)
    for child in node:
        _walk(child, buffer, lines)
        # lxml keeps the text that follows a child element in child.tail,
        # not in the parent's .text, so it has to be collected here.
        if child.tail:
            buffer.append(child.tail)


def _paragraph_text(paragraph):
    """Return the text and image-link lines of one paragraph, newline-joined."""
    buffer = []
    lines = []
    _walk(paragraph, buffer, lines)
    _flush_text(buffer, lines)  # text after the last image, if any
    return "\n".join(lines)


# Parse the HTML document.
root = etree.HTML(html_doc)
# Select every <h2> that is a direct child of the solution container.
paragraphs = root.xpath('//div[@id="solution-text"]/h2')
# One newline-terminated entry per paragraph, in document order.
result_string = "".join(
    f"{_paragraph_text(paragraph)}\n" for paragraph in paragraphs
)
# Drop the leading/trailing whitespace, including the final newline.
result_te = result_string.strip()
# Print the formatted result.
print({'result_te': result_te})