from scrapy.selector import Selector
# Sample HTML document: one <ul> holding four <li> items (classes item-0 to
# item-3), each wrapping a single <a> link.
text = '''
<html><body>
<div>
<ul>
<li class="item-0"><a href="link1.html">first</a></li>
<li class="item-1"><a href="link2.html">second</a></li>
<li class="item-2"><a href="link3.html">third</a></li>
<li class="item-3"><a href="link4.html">fourth</a></li>
</ul>
</div>
</body></html>
'''
# Wrap the markup in a Selector so it can be queried with XPath expressions.
res = Selector(text=text)
父节点
# Get the class name of the parent node of the <a> tag whose href is link2.html.
# `..` steps from the <a> up to its parent <li>. getall() extracts the matched
# attribute values as plain strings; without it, print() would show a list of
# Selector objects rather than the values themselves.
result = res.xpath('//a[@href="link2.html"]/../@class').getall()
print(result)
# ['item-1']
获取属性值
# Get the href attribute of every <a> that is a direct child of an <li>.
# The query goes through `res`, the selector built from the four-item sample
# document; `html` is not defined at this point in the script. getall()
# extracts the attribute values as plain strings.
result = res.xpath('//li/a/@href').getall()
print(result)
# ['link1.html', 'link2.html', 'link3.html', 'link4.html']
属性多值匹配
使用contains函数匹配
text = '''
<li class="li li-first"><a href="link.html">first item</a></li>
'''
html = Selector(text=text)
# Only get() accepts a fallback value; getall() has no such argument.
# The exact comparison finds nothing because the class attribute is
# "li li-first", not "li", so the fallback is returned.
result = html.xpath('//li[@class="li"]/a/text()').get(default='no content')
# 'no content'
# contains() tests for a substring, so it matches the multi-valued class.
result = html.xpath('//li[contains(@class, "li")]/a/text()').get()
# 'first item'
多属性匹配
需要匹配满足多个属性的节点,使用 and 运算符
# Sample fragment: a single <li> carrying both a multi-valued class attribute
# and a name attribute.
text = '''
<li class="li li-first" name="item"><a href="link.html">first item</a></li>
'''
html = Selector(text=text)
# Match on both the class and the name attribute by joining the two
# predicates with the XPath `and` operator.
result = html.xpath('//li[contains(@class, "li") and @name="item"]/a/text()').get()
# 'first item'
xpath的运算符介绍
按序选择
根据节点所在的顺序进行提取
# Sample HTML document: one <ul> holding four <li> items, each wrapping a link.
text = '''
<html><body>
<div>
<ul>
<li class="item-0"><a href="link1.html">first</a></li>
<li class="item-1"><a href="link2.html">second</a></li>
<li class="item-2"><a href="link3.html">third</a></li>
<li class="item-3"><a href="link4.html">fourth</a></li>
</ul>
</div>
</body></html>
'''
html = Selector(text=text)
# Select by position. XPath positions start at 1, not 0.
# Note the difference between //li[1] and (//li)[1]: the former selects every
# <li> that is the first <li> child of its own parent, while the latter selects
# only the single first <li> among all matches in the document.
result = html.xpath('//li[1]/a/text()').get()
# 'first'
# last(): the last <li> among its siblings.
result = html.xpath('//li[last()]/a/text()').get()
# 'fourth'
# position(): filter by position; here the <li> nodes at positions 1 and 2.
result = html.xpath('//li[position()<3]/a/text()').getall()
# ['first', 'second']
# The - operator: last() is 4 here, so last()-2 selects the 2nd <li>.
result = html.xpath('//li[last()-2]/a/text()').get()
# 'second'
节点轴选择
# ancestor::* -- every ancestor node of the first <li>.
result = html.xpath('//li[1]/ancestor::*')
# [<Selector xpath='//li[1]/ancestor::*' data='<html><body>\n<div>\n <ul>\n <li '>,
# <Selector xpath='//li[1]/ancestor::*' data='<body>\n<div>\n <ul>\n <li class='>,
# <Selector xpath='//li[1]/ancestor::*' data='<div>\n <ul>\n <li class="item-0'>,
# <Selector xpath='//li[1]/ancestor::*' data='<ul>\n <li class="item-0"><a href='>]
# Four ancestor nodes are returned: <html>, <body>, <div> and <ul>.
# ancestor::div -- only the <div> among the ancestors.
result = html.xpath('//li[1]/ancestor::div')
# [<Selector xpath='//li[1]/ancestor::div' data='<div>\n <ul>\n <li class="item-0'>]
# attribute::* -- the values of all attributes of the node.
result = html.xpath('//li[1]/attribute::*').getall()
# ['item-0']
# child:: -- direct children, here the <a> whose href is link1.html.
result = html.xpath('//li[1]/child::a[@href="link1.html"]').get()
# '<a href="link1.html">first</a>'
# descendant:: -- the <a> elements among the node's descendants.
result = html.xpath('//li[1]/descendant::a').getall()
# ['<a href="link1.html">first</a>']
# following:: -- the 2nd of all nodes that come after this node in the
# document (positions are counted from 1).
result = html.xpath('//li[1]/following::*[2]').getall()
# ['<a href="link2.html">second</a>']
# following-sibling:: -- all <li> siblings after the <li> whose class
# contains "0". The query uses `html`, the selector defined for this document;
# no `resp` object exists in this script.
result = html.xpath('//ul/li[contains(@class,"0")]/following-sibling::li/a/text()').getall()
# ['second', 'third', 'fourth']
附一些相关学习链接:
选择器小结:正则表达式、XPath选择器、CSS选择器小结和使用场景
python爬虫:scrapy框架xpath和css选择器语法