解析:xpath
xpath
xpath基本语法:
- 路径查询
//:查找所有子孙节点,不考虑层级关系
/:找直接子节点 - 谓词查询
//div[@id]
//div[@id="maincontent"] - 属性查询
//@class - 模糊查询
//div[contains(@id, "he")]
//div[starts-with(@id, "he")] - 内容查询
//div/h1/text() - 逻辑运算
//div[@id="head" and @class="s_down"]
//title | //price
注意:xpath既可以解析本地文件也可以解析服务器响应的文件
<!DOCTYPE html>
<!-- Sample local file (spider_test.html): two <ul> lists of city names,
     used below to demonstrate the XPath query types from the notes. -->
<html lang="en">
<head>
<meta charset="UTF-8"/>
<title>Title</title>
</head>
<body>
<ul>
<li>北京</li>
<li>上海</li>
<li>广州</li>
<li>深圳</li>
</ul>
<ul>
<li id="l1" class="c1">大连</li>
<li id="l2">哈尔滨</li>
<li id="c3">沈阳</li>
<li id="c4">长春</li>
</ul>
</body>
</html>
from lxml import etree
# XPath parsing with lxml, applied to:
# (1) a local HTML file
# (2) data from a server response: response.read().decode('utf-8')
# Parse a local file with etree.parse()
tree = etree.parse('spider_test.html')
# tree.xpath('xpath expression') always returns a list
# Find the li elements under body/ul
li_list = tree.xpath("//body/ul/li")
print(li_list)
print(len(li_list))
# Find the text of every li tag that has an id attribute
li_list = tree.xpath("//ul/li[@id]/text()") # path query
print(li_list)
print(len(li_list))
# Find the text of the li tag whose id is l1
li_list = tree.xpath("//ul/li[@id='l1']/text()") # predicate query
print(li_list)
print(len(li_list))
# Get the class attribute value of the li tag whose id is l1
li_list = tree.xpath("//ul/li[@id='l1']/@class") # attribute query
print(li_list)
print(len(li_list))
# Find li tags whose id contains "l"
li_list =tree.xpath('//ul/li[contains(@id, "l")]/text()') # fuzzy query
print(li_list)
print(len(li_list))
# Find li tags whose id starts with "c"
# NOTE(review): the original note said ids starting with "l", but the code checks "c"
li_list =tree.xpath('//ul/li[starts-with(@id, "c")]/text()') # fuzzy query
print(li_list)
print(len(li_list))
# Find the li tag whose id is l1 AND class is c1
li_list =tree.xpath('//ul/li[@id="l1" and @class="c1"]/text()') # logical AND
print(li_list)
print(len(li_list))
# Find the li tag with id l1 as well as the one with id c3
li_list =tree.xpath('//ul/li[@id="l1"]/text() | //ul/li[@id="c3"]/text()') # union of two paths
print(li_list)
print(len(li_list))
案例1:获取百度网站的百度一下
from lxml import etree
import urllib.request
# (1)获取网页源码 (2)解析 (3)打印结果
url = 'http://www.baidu.com'
headers = {
'User-Agent':'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
}
#请求对象的定制
request = urllib.request.Request(url=url, headers=headers)
#模拟浏览器访问服务器
response = urllib.request.urlopen(request)
#获取网页源码
content = response.read().decode('utf-8')
#解析网页源码
tree = etree.HTML(content)
#获取想要的数据,xpath的返回值是一个列表类型的数据
result = tree.xpath('//input[@id="su"]/@value')
print(result[0])
案例2:爬取站长素材的图片
#(1)获取网页源码 (2)解析 (3)下载
#url = 'https://sc.chinaz.com/tupian/shengdanjietupian.html' 第一页
#url = 'https://sc.chinaz.com/tupian/shengdanjietupian_2.html' 第二页
#url = 'https://sc.chinaz.com/tupian/shengdanjietupian_3.html' 第三页
#请求对象的定制
def create_request(page):
    """Build a urllib Request for one listing page of the chinaz
    Christmas-image section.

    Args:
        page: 1-based page number. Page 1 has no numeric suffix in
            its URL; page N (N > 1) uses the ``_N`` suffix.

    Returns:
        urllib.request.Request carrying a User-Agent header.
    """
    if page == 1:
        url = 'https://sc.chinaz.com/tupian/shengdanjietupian.html'
    else:
        url = 'https://sc.chinaz.com/tupian/shengdanjietupian_' + str(page) + '.html'
    headers = {
        # BUGFIX: header key was misspelled 'User-Agegnt', so the
        # User-Agent was never actually sent to the server.
        'User-Agent': 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
    }
    return urllib.request.Request(url=url, headers=headers)
def get_content(request):
    """Send *request* (acting as a browser) and return the response
    body decoded as UTF-8.

    FIX: the response object was never closed; the context manager
    guarantees the HTTP connection is released even if read()/decode()
    raises.
    """
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')
def download_jpg(content):
    """Parse a listing page's HTML *content* and download every image
    into ./Christmas_img/, named after its alt text.

    The site lazy-loads its images, so the real image URL lives in the
    ``data-original`` attribute rather than ``src``.
    """
    import os
    tree = etree.HTML(content)
    name_list = tree.xpath('//body//div[@class="container"]//div[@class="item"]/img/@alt')
    # Lazy loading: real URL is in data-original, not src
    data_original = tree.xpath('//body//div[@class="container"]//div[@class="item"]/img/@data-original')
    # FIX: urlretrieve fails if the target directory does not exist
    os.makedirs('./Christmas_img', exist_ok=True)
    # FIX: zip() instead of indexing by range(len(name_list)) — avoids an
    # IndexError when the two xpath result lists differ in length
    for name, jpg in zip(name_list, data_original):
        url = 'https:' + jpg
        urllib.request.urlretrieve(url=url, filename='./Christmas_img/' + name + '.jpg')
if __name__ == '__main__':
    # Page range is inclusive on both ends
    start_page = int(input("请输入起始页码:"))
    end_page = int(input("请输入结束页码:"))
    for page in range(start_page, end_page+1):
        # Build the request object for this page
        request = create_request(page)
        # Fetch the page source
        content = get_content(request)
        # Parse it and download the images
        download_jpg(content)