xpath
xpath简介
xpath是一种html和xml的查询语言,它能够在xml和html的树状结构中寻找节点
在文档中通过元素和属性进行导航
简单一句话是可以根据地址找人的技术
xpath-helper的使用
常用节点选择工具
- chrome插件 XPathHelper
- Firefox插件 XPathHelper
安装好之后按 ctrl + shift + x 启动或关闭插件
element对象
# In Python we install the lxml library to get XPath support
# pip install lxml
from lxml import etree

# What the etree module is used for here:
# 1. turn an HTML string into an Element object
# 2. turn an Element object back into a string / bytes
wb_data = """
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
"""
# Parse the fragment into an Element tree (etree.HTML also repairs
# missing html/body tags).
dom = etree.HTML(wb_data)
# Serialize back; tostring() returns bytes (<class 'bytes'>),
# so decode before printing.
serialized = etree.tostring(dom)
print(serialized.decode())
xpath
from lxml import etree

MyStr = '''<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="renderer" content="webkit">
<meta name="referrer" content="always">
<meta name="google-site-verification" content="ok0wCgT20tBBgo9_zat2iAcimtN4Ftf5ccsh092Xeyw" />
<meta name="baidu-site-verification" content="cZdR4xxR7RxmM4zE" />
<meta http-equiv="Pragma" content="no-cache">
<meta http-equiv="Expires" content="Sun, 6 Mar 2005 01:00:00 GMT">
<link rel="apple-touch-icon" href="https://img3.doubanio.com/f/movie/d59b2715fdea4968a450ee5f6c95c7d7a2030065/pics/movie/apple-touch-icon.png">
<link href="https://img3.doubanio.com/f/shire/3e5dfc68b0f376484c50cf08a58bbca3700911dc/css/douban.css" rel="stylesheet" type="text/css">
<link href="https://img3.doubanio.com/f/shire/ae3f5a3e3085968370b1fc63afcecb22d3284848/css/separation/_all.css" rel="stylesheet" type="text/css">
<link href="https://img3.doubanio.com/f/movie/8864d3756094f5272d3c93e30ee2e324665855b0/css/movie/base/init.css" rel="stylesheet">
<script type="text/javascript">var _head_start = new Date();</script>
<script type="text/javascript" src="https://img3.doubanio.com/f/movie/0495cb173e298c28593766009c7b0a953246c5b5/js/movie/lib/jquery.js"></script>
<script type="text/javascript" src="https://img3.doubanio.com/f/shire/5ecaf46d6954d5a30bc7d99be86ae34031646e00/js/douban.js"></script>
<script type="text/javascript" src="https://img3.doubanio.com/f/shire/0efdc63b77f895eaf85281fb0e44d435c6239a3f/js/separation/_all.js"></script>
'''

# Parse the fragment, then pull out every href value on a <link>
# element with the expression //link/@href.
dom = etree.HTML(MyStr)
hrefs = dom.xpath('//link/@href')
# The result is a plain list, so print it one entry at a time.
for href in hrefs:
    print(href)
节点之间的关系
表达式 | 描述 |
---|---|
nodename | 选取此节点的所有子节点 |
/ | 从根节点选取 |
// | 从匹配选择的当前节点选择文档中的节点,而不考虑它们的位置 |
. | 选取当前节点 |
.. | 选取当前节点的父节点 |
@ | 选取属性 |
# 测试代码
'''
<bookstore>(文档的节点)
<book>
<title lang="en">Harry Potter</title>(属性节点)
<author>J K.Rowling</author>(元素的节点)
<year>2005</year>
<price>29.99</price>
</book>
</bookstore>
Nodename节点的名字 有七种类型的节点 元素 属性 文本 命名空间 处理指令 注释 文档(根)节点
book元素是 title author year price元素的父节点
title author year price元素是同胞 (拥有相同的父节点)
(先辈)某节点的父或者父的父 title元素的先辈是book元素和bookstore
'''
选取节点
选取节点
# 选取节点
# Selecting nodes
from lxml import etree

# Parse a local file with an explicit HTML parser so broken markup
# is tolerated.
html = etree.parse('./test.html',etree.HTMLParser())
# result = etree.tostring(html)
#
# # print(html) # <lxml.etree._ElementTree object at 0x0000007B9692A348>
# print(result)
# # //* matches every node in the document
# result = html.xpath('//*')
# print(result)
# Locate specific nodes, e.g. every <a> under a <li>
# / selects from the root, as in a/b/c
# result = html.xpath('//li/a')
# Get the class name of the parent of the <a> tag carrying the src2 attribute
# <li class="item-1"><a src2="address2.html">second</a></li>
# @class
# result = html.xpath('//a[@src2="link2.html"]/../@class')
# @ reads an attribute value, here src
# result = html.xpath('//li/a/@src')
# text() reads the text content of a node
# Get the text of the <a> child of every <li> whose class is item-0
# <li class="item-0"><a src="link1.html">first item</a></li>
matches = html.xpath('//li[@class="item-0"]/a')
for anchor in matches:
    print(anchor.text)
print(matches)
豆瓣练习
**注意:**遇到输出错误<Response [418]>时,加入headers即可,输出显示<Response [200]>时就正确往下执行
# 需求:获取电影的标题 引言 评分 网址 每一页的内容全部抓取并保存到csv文件当中
# https://movie.douban.com/top250?start=0&filter= 第一页
# https://movie.douban.com/top250?start=25&filter= 第二页
# https://movie.douban.com/top250?start=50&filter= 第三页
# https://movie.douban.com/top250?start=75&filter= 第四页
# 页数url的规律是 (当前页数-1)*25
import requests
import lxml.html
import csv
# Request headers: a browser User-Agent so Douban answers 200 instead of 418
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'}
# Target url template; start = (page number - 1) * 25 selects the page
doubanUrl = 'https://movie.douban.com/top250?start={}&filter='
# 找到我们想要的数据 html源代码
def getSource(url):
    """Fetch *url* and return its HTML source as text.

    Sends the module-level ``headers`` (browser User-Agent) so Douban
    does not reject the request with a 418 response.
    """
    # requests has no default timeout; without one a stalled connection
    # would hang the crawler forever.
    response = requests.get(url, headers=headers, timeout=10)
    # Force utf-8 so the Chinese page text is not garbled.
    response.encoding = 'utf-8'
    return response.text
# getSource(doubanUrl)
# 引言 评分 网址 标题 ---> 网页源代码中
def getEveryItem(source):
    """Parse one Top250 page and return its movies.

    Returns a list of dicts, one per movie, with keys
    'title', 'url', 'star' and 'quote'.
    """
    # Build an HTML element tree from the page source
    selector = lxml.html.document_fromstring(source)
    # Each <div class="info"> block holds one movie's details
    movieItemList = selector.xpath('//div[@class="info"]')
    # Accumulates one dict per movie: [{movie1}, {movie2}, ...]
    movieList = []
    for eachMovie in movieItemList:
        # One dict per movie: quote, rating, url, title
        movieDict = {}
        title = eachMovie.xpath('div[@class="hd"]/a/span[1]/text()')  # main title
        # BUG FIX: the original expression started with '//', which is an
        # absolute path — on an element it searches the WHOLE document and
        # returned every movie's subtitle for each movie. A relative path
        # keeps the query scoped to this eachMovie element.
        otherTitle = eachMovie.xpath('div[@class="hd"]/a/span[2]/text()')  # subtitle
        link = eachMovie.xpath('div[@class="hd"]/a/@href')[0]  # url
        star = eachMovie.xpath('div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')[0]  # rating
        quote = eachMovie.xpath('div[@class="bd"]/p[@class="quote"]/span/text()')  # quote (famous line)
        # Not every movie has a quote; fall back to a blank placeholder
        if quote:
            quote = quote[0]
        else:
            quote = ' '
        # Store the movie's data in the dict
        movieDict['title'] = ''.join(title + otherTitle)
        movieDict['url'] = link
        movieDict['star'] = star
        movieDict['quote'] = quote
        print(movieDict)
        movieList.append(movieDict)
    return movieList
# 写入数据 csv文件当中
def writeData(movieList):
    """Write the movie dicts to DoubanMovie.csv (header + one row each).

    ``movieList`` is a list of dicts with keys 'title', 'star',
    'quote' and 'url'.
    """
    # newline='' is required by the csv module: without it the writer
    # emits an extra blank line between rows on Windows.
    with open('DoubanMovie.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['title', 'star', 'quote', 'url'])
        writer.writeheader()  # header row
        # Write every movie as its own row
        writer.writerows(movieList)
# 启动程序
# Program entry point
if __name__ == '__main__':
    allMovies = []
    # Top250 spans 10 pages of 25 movies each
    for page in range(10):
        # Build this page's url: start = page index * 25
        pageLink = doubanUrl.format(page * 25)
        print(pageLink)
        html = getSource(pageLink)
        allMovies.extend(getEveryItem(html))
    # print(allMovies[:10])
    writeData(allMovies)