- Xpath删除指定标签
# Procedure:
# 1. Select every matching element with XPath.
# 2. Detach each matched element from its parent node.
scripts = html.xpath('//script')
for s in scripts:
    parent = s.getparent()
    if parent is None:
        # A root element has no parent it could be removed from.
        continue
    if s.tail:
        # remove() also discards the element's tail text (the text that
        # follows its closing tag), so hand that text to the preceding
        # sibling, or to the parent when there is no sibling, first.
        previous = s.getprevious()
        if previous is not None:
            previous.tail = (previous.tail or '') + s.tail
        else:
            parent.text = (parent.text or '') + s.tail
    parent.remove(s)
- Xpath删除指定标签属性
# Procedure:
# 1. Select the target element with XPath.
# 2. Delete its attributes with strip_attributes.
# strip_attributes is a function in lxml's etree module that modifies the
# attributes of elements. The stub below reproduces its signature and
# docstring for reference only:
def strip_attributes(tree_or_element, *attribute_names):  # real signature unknown; restored from __doc__
    """
    strip_attributes(tree_or_element, *attribute_names)

    Delete all attributes with the provided attribute names from an
    Element (or ElementTree) and its descendants.

    Attribute names can contain wildcards as in `_Element.iter`.

    Example usage::

        strip_attributes(root_element,
                         'simpleattr',
                         '{http://some/ns}attrname',
                         '{http://other/ns}*')
    """
    pass
"""示例:"""
# Strip the href attribute from the author element (an <a> tag per the
# original note).
user = html.xpath('//*[@class="authorName"]')
# xpath() returns an empty list when nothing matches; skip instead of
# raising IndexError on user[0].
if user:
    # Attribute names are passed as separate positional strings, matching
    # the documented strip_attributes(tree_or_element, *attribute_names).
    etree.strip_attributes(user[0], "href")
    # Remove every attribute of the element: "{}*" is the wildcard for any
    # attribute name that has no namespace.
    etree.strip_attributes(user[0], "{}*")
- Xpath替换标签属性值
# Replace an attribute value on the selected elements.
# Select the target <img> elements.
imgs = html.xpath('//*[@class="contentMedia contentPadding"]/div/div/img')
for i in imgs:
    # Overwrite the src attribute with the replacement value.
    i.set('src', "要替换的值")
- Xpath将etree转换后的页面再次转换为String
# Fetch the page. The timeout keeps the request from blocking forever on an
# unresponsive server. decode() assumes the page is UTF-8 encoded.
html_1 = requests.get(url, timeout=10).content.decode()
html = etree.HTML(html_1)
# Convert the parsed tree back to a string with tostring().
# method="html" serializes with HTML rules; the default XML serializer would
# collapse empty elements such as <script></script> into <script/>.
# encoding="unicode" returns a str directly, so no encode/decode round trip.
html_str = etree.tostring(html, encoding="unicode", method="html")
print(html_str)
后续不定期更新Xpath的非常用方法,谢谢阅读!!!!