scrapy_selector_xpath基础语法

# 《精通 scrapy 网络爬虫》第3章 第3节(即3.3)xpath 实例

from scrapy.selector import Selector
from scrapy.http import HtmlResponse

# Sample HTML document that every XPath example below is evaluated against.
body = '''
<html>
	<head>
		<base href='http://example.com'/>
		<title>Example website</title>
	</head>
	<body>
		<div id='images'>
			<a href='image1.html'>Name:Image 1 <br/><img src="image1.jpg"/></a>
			<a href='image2.html'>Name:Image 2 <br/><img src="image2.jpg"/></a>
			<a href='image3.html'>Name:Image 3 <br/><img src="image3.jpg"/></a>
			<a href='image4.html'>Name:Image 4 <br/><img src="image4.jpg"/></a>
			<a href='image5.html'>Name:Image 5 <br/><img src="image5.jpg"/></a>
		</div>
	</body>
</html>
'''
# Wrap the markup in an HtmlResponse so the response.xpath() shortcut can be used.
response = HtmlResponse(url='http://www.example.com/', body=body, encoding='utf-8')

# /: an absolute path that starts at the document root
print(response.xpath('/html'))
print(response.xpath('/html/head'))

# E1/E2: every E2 that is a direct child of E1
print(response.xpath('/html/body/div/a'))

# //E: every E in the document, wherever it is located
print(response.xpath('//a'))

# E1//E2: every E2 among the descendants of E1, at any depth
print(response.xpath('/html/body//img'))
print(response.xpath('/html/body//a'))

# E/text(): the text child nodes of E
print(response.xpath('//a/text()').extract())

# E/*: every element child of E
print(response.xpath('/html/*'))
print(response.xpath('//body/*'))

# */E: every E that is a grandchild (child of any element child)
print(response.xpath('//div/*/img'))

# E/@ATTR: the ATTR attribute of E
print(response.xpath('//img/@src'))

# //@ATTR: every ATTR attribute in the document
print(response.xpath('//@href'))
print(response.xpath('//@href').extract())

# E/@*: every attribute of E
print(response.xpath('//a[1]/img/@*'))

# .: the current node, used to write relative paths
sel = response.xpath('//a')[0]  # first <a> element, used as the context node below
print(sel)
# '//img' is still an absolute path: it searches from the document root,
# not from the current <a>, so all five images come back.
print(sel.xpath('//img'))
# './/img' is relative: only <img> descendants of the current <a>.
print(sel.xpath('.//img'))

# ..: the parent of the current node, used to write relative paths.
# Evaluated from `sel` (the first <a>) so that '..' really is "the parent of
# the current node": its parent is <div id='images'>, and '..//img' yields the
# <img> elements below that <div>.
print(sel.xpath('..//img'))

# node[predicate]: pick a specific node, or nodes holding a specific value
# the 3rd <a>
print(response.xpath('//a[3]'))
# last(): the last <a>
print(response.xpath('//a[last()]'))
# position(): the first two <a>
print(response.xpath('//a[position()<=2]'))
# every <div> that has an id attribute
print(response.xpath('//div[@id]'))
# filter on the value of the id attribute
print(response.xpath('//div[@id="images"]'))
-----------------------------------
D:\Python\Python36\python.exe D:/Project0611/test01/scrapySelectorXpathTest.py
[<Selector xpath='/html' data='<html>\n\t<head>\n\t\t<base href="http://exam'>]
[<Selector xpath='/html/head' data='<head>\n\t\t<base href="http://example.com"'>]
[<Selector xpath='/html/body/div/a' data='<a href="image1.html">Name:Image 1 <br><'>, <Selector xpath='/html/body/div/a' data='<a href="image2.html">Name:Image 2 <br><'>, <Selector xpath='/html/body/div/a' data='<a href="image3.html">Name:Image 3 <br><'>, <Selector xpath='/html/body/div/a' data='<a href="image4.html">Name:Image 4 <br><'>, <Selector xpath='/html/body/div/a' data='<a href="image5.html">Name:Image 5 <br><'>]
[<Selector xpath='//a' data='<a href="image1.html">Name:Image 1 <br><'>, <Selector xpath='//a' data='<a href="image2.html">Name:Image 2 <br><'>, <Selector xpath='//a' data='<a href="image3.html">Name:Image 3 <br><'>, <Selector xpath='//a' data='<a href="image4.html">Name:Image 4 <br><'>, <Selector xpath='//a' data='<a href="image5.html">Name:Image 5 <br><'>]
[<Selector xpath='/html/body//img' data='<img src="image1.jpg">'>, <Selector xpath='/html/body//img' data='<img src="image2.jpg">'>, <Selector xpath='/html/body//img' data='<img src="image3.jpg">'>, <Selector xpath='/html/body//img' data='<img src="image4.jpg">'>, <Selector xpath='/html/body//img' data='<img src="image5.jpg">'>]
[<Selector xpath='/html/body//a' data='<a href="image1.html">Name:Image 1 <br><'>, <Selector xpath='/html/body//a' data='<a href="image2.html">Name:Image 2 <br><'>, <Selector xpath='/html/body//a' data='<a href="image3.html">Name:Image 3 <br><'>, <Selector xpath='/html/body//a' data='<a href="image4.html">Name:Image 4 <br><'>, <Selector xpath='/html/body//a' data='<a href="image5.html">Name:Image 5 <br><'>]
['Name:Image 1 ', 'Name:Image 2 ', 'Name:Image 3 ', 'Name:Image 4 ', 'Name:Image 5 ']
[<Selector xpath='/html/*' data='<head>\n\t\t<base href="http://example.com"'>, <Selector xpath='/html/*' data='<body>\n\t\t<div id="images">\n\t\t\t<a href="i'>]
[<Selector xpath='//body/*' data='<div id="images">\n\t\t\t<a href="image1.htm'>]
[<Selector xpath='//div/*/img' data='<img src="image1.jpg">'>, <Selector xpath='//div/*/img' data='<img src="image2.jpg">'>, <Selector xpath='//div/*/img' data='<img src="image3.jpg">'>, <Selector xpath='//div/*/img' data='<img src="image4.jpg">'>, <Selector xpath='//div/*/img' data='<img src="image5.jpg">'>]
[<Selector xpath='//img/@src' data='image1.jpg'>, <Selector xpath='//img/@src' data='image2.jpg'>, <Selector xpath='//img/@src' data='image3.jpg'>, <Selector xpath='//img/@src' data='image4.jpg'>, <Selector xpath='//img/@src' data='image5.jpg'>]
[<Selector xpath='//@href' data='http://example.com'>, <Selector xpath='//@href' data='image1.html'>, <Selector xpath='//@href' data='image2.html'>, <Selector xpath='//@href' data='image3.html'>, <Selector xpath='//@href' data='image4.html'>, <Selector xpath='//@href' data='image5.html'>]
['http://example.com', 'image1.html', 'image2.html', 'image3.html', 'image4.html', 'image5.html']
[<Selector xpath='//a[1]/img/@*' data='image1.jpg'>]
<Selector xpath='//a' data='<a href="image1.html">Name:Image 1 <br><'>
[<Selector xpath='//img' data='<img src="image1.jpg">'>, <Selector xpath='//img' data='<img src="image2.jpg">'>, <Selector xpath='//img' data='<img src="image3.jpg">'>, <Selector xpath='//img' data='<img src="image4.jpg">'>, <Selector xpath='//img' data='<img src="image5.jpg">'>]
[<Selector xpath='.//img' data='<img src="image1.jpg">'>]
[<Selector xpath='..//img' data='<img src="image1.jpg">'>, <Selector xpath='..//img' data='<img src="image2.jpg">'>, <Selector xpath='..//img' data='<img src="image3.jpg">'>, <Selector xpath='..//img' data='<img src="image4.jpg">'>, <Selector xpath='..//img' data='<img src="image5.jpg">'>]
[<Selector xpath='//a[3]' data='<a href="image3.html">Name:Image 3 <br><'>]
[<Selector xpath='//a[last()]' data='<a href="image5.html">Name:Image 5 <br><'>]
[<Selector xpath='//a[position()<=2]' data='<a href="image1.html">Name:Image 1 <br><'>, <Selector xpath='//a[position()<=2]' data='<a href="image2.html">Name:Image 2 <br><'>]
[<Selector xpath='//div[@id]' data='<div id="images">\n\t\t\t<a href="image1.htm'>]
[<Selector xpath='//div[@id="images"]' data='<div id="images">\n\t\t\t<a href="image1.htm'>]

Process finished with exit code 0

发布了23 篇原创文章 · 获赞 1 · 访问量 1028
展开阅读全文

没有更多推荐了,返回首页

©️2019 CSDN 皮肤主题: 1024 设计师: 上身试试

分享到微信朋友圈

×

扫一扫,手机浏览