本文在Jupyter Notebook下可以直接运行,选取的站点为猫眼电影top排行榜。
本文是在异步社区出版的图书《精通Python爬虫框架Scrapy》第二章的帮助下完成的。
# -*- encoding:utf-8 -*-
import requests
from lxml import etree
# Browser-like User-Agent header sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
}
url = "http://maoyan.com/board"
# Always pass a timeout: without one, requests.get can block indefinitely
# when the server never answers.
response = requests.get(url=url, headers=headers, timeout=10)
# Parse the decoded page text into an lxml element tree; `html` is the tree
# that the XPath queries in the rest of this file run against.
html = etree.HTML(response.text)
html.xpath('/html')
[<Element html at 0x7f04b415fd88>]
result = etree.tostring(html)
print(result.decode('utf-8'))
# 网页信息太多,就不打印出来了
html.xpath("/html")
[<Element html at 0x7f04b415fd88>]
html.xpath("/html/body")
[<Element body at 0x7f04b4111508>]
html.xpath("/html/head/title")
[<Element title at 0x7f04bc642a88>]
使用text()获取文字
html.xpath("/html/head/title/text()")
['热映口碑榜 - 猫眼电影 - 一网打尽好电影']
利用 [] 指定选取
从1开始,不支持切片
html.xpath("/html/head/meta")
[<Element meta at 0x7f04b4111348>,
<Element meta at 0x7f04b4111b88>,
<Element meta at 0x7f04b410e5c8>,
<Element meta at 0x7f04b410e148>,
<Element meta at 0x7f04b410e548>,
<Element meta at 0x7f04b410e308>,
<Element meta at 0x7f04b410e848>,
<Element meta at 0x7f04b410e6c8>,
<Element meta at 0x7f04b410ea48>,
<Element meta at 0x7f04b410e948>]
html.xpath("/html/head/meta[2]")
[<Element meta at 0x7f04b4111b88>]
选取有name属性的meta
# 比上面的html.xpath("/html/head/meta")少了三个(10个减为7个)
html.xpath("/html/head/meta[@name]")
[<Element meta at 0x7f04b4111b88>,
<Element meta at 0x7f04b410e5c8>,
<Element meta at 0x7f04b410e308>,
<Element meta at 0x7f04b410e848>,
<Element meta at 0x7f04b410e6c8>,
<Element meta at 0x7f04b410ea48>,
<Element meta at 0x7f04b410e948>]
选取name属性名为description的元素
html.xpath("/html/head/meta[@name='description']")
[<Element meta at 0x7f04b410e5c8>]
使用@获取属性
可以获取标签内的所有属性,例如class href等
html.xpath("/html/head/meta[2]/@content")
['猫眼电影,电影排行榜,热映口碑榜,最受期待榜,国内票房榜,北美票房榜,猫眼TOP100']
/与//
/ 从根节点(或上一步节点的直接子节点)开始逐级选取,即绝对路径
// 选取文档中任意层级的匹配节点(后代节点),不考虑它们的位置
获取head下所有的链接
html.xpath('//head//link/@href')
['//p0.meituan.net',
'//p1.meituan.net',
'//ms0.meituan.net',
'//ms1.meituan.net',
'//analytics.meituan.com',
'//report.meituan.com',
'//frep.meituan.com',
'//ms0.meituan.net/mywww/common.4b838ec3.css',
'//ms0.meituan.net/mywww/board-index.92a06072.css']
常见任务示例
获取第二部影片的超链接(先定位到包含所有影片的区域,再选取影片信息)
html.xpath('//dl[@class="board-wrapper"]/dd[2]/a/@href')
['/films/1212592']
选择class属性以movie-item开头的div标签下的所有上映时间
html.xpath('//div[starts-with(@class,"movie-item")]/p[@class="releasetime"]/text()')
['上映时间:2018-07-05',
'上映时间:2018-07-27',
'上映时间:2018-07-20',
'上映时间:2018-06-22',
'上映时间:2018-07-06',
'上映时间:2018-06-15',
'上映时间:2018-07-20',
'上映时间:2018-07-20',
'上映时间:2018-06-29',
'上映时间:2018-07-14']
def select_by_axis():
    """Demonstrate XPath axis selection on a small inline HTML fragment.

    Parses the fragment with ``etree.HTML`` and prints the result of one
    query per axis (ancestor, attribute, child, descendant, following,
    following-sibling), each anchored at ``//li[1]``. Returns nothing.
    """
    # Note: the last <li> in this fragment has no closing tag.
    markup = '''
<div>
<ul>
<li class="item-0"><a href="link1.html"><span>first item</span></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
    tree = etree.HTML(markup)
    # One expression per axis, evaluated and printed in this order.
    axis_queries = (
        '//li[1]/ancestor::*',                   # every ancestor
        '//li[1]/ancestor::div',                 # only <div> ancestors
        '//li[1]/attribute::*',                  # all attribute values
        '//li[1]/child::a[@href="link1.html"]',  # direct <a> child with this href
        '//li[1]/descendant::span',              # <span> at any depth below
        '//li[1]/following::*[2]',               # second node after the closing tag
        '//li[1]/following-sibling::*',          # all later siblings
    )
    for query in axis_queries:
        print(tree.xpath(query))
select_by_axis()
[<Element html at 0x7f04b40cbd08>, <Element body at 0x7f04b40d0ac8>, <Element div at 0x7f04b40d0b08>, <Element ul at 0x7f04b40d0b48>]
[<Element div at 0x7f04b40d0b08>]
['item-0']
[<Element a at 0x7f04b40d0b08>]
[<Element span at 0x7f04b40d0bc8>]
[<Element a at 0x7f04b40d0b08>]
[<Element li at 0x7f04b40d0ac8>, <Element li at 0x7f04b40d0c08>, <Element li at 0x7f04b40d0c48>, <Element li at 0x7f04b40d0c88>]
参考:w3c