前言
Xpath的基本使用
一、安装lxml库
进入python3.7目录
bogon:bin yingyan$ cd /Library/Frameworks/Python.framework/Versions/3.7/bin
安装lxml
bogon:bin yingyan$ pip3 install lxml -i https://pypi.douban.com/simple/
Looking in indexes: https://pypi.douban.com/simple/
Collecting lxml
Downloading https://pypi.doubanio.com/packages/98/9c/fbbbcafca14a8711c8a036375389cb8c5b1d40185357ae6fbb62d9658d41/lxml-4.9.2-cp310-cp310-macosx_10_15_x86_64.whl (4.7 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.7/4.7 MB 1.0 MB/s eta 0:00:00
Installing collected packages: lxml
Successfully installed lxml-4.9.2
WARNING: You are using pip version 22.0.4; however, version 23.0 is available.
You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.10/bin/python3.10 -m pip install --upgrade pip' command.
添加lxml库
引入包
from lxml import etree
二、使用步骤
1、基本使用
代码如下(示例):xpath基本使用.html
<!DOCTYPE html>
<!-- Sample document parsed by the tutorial script below. The ids "1"/"2" vs
     "l1"/"l2" are what the predicate, fuzzy-match and logical-operator
     XPath examples select against. -->
<html lang="en">
<head>
<meta charset="UTF-8"/>
<title>Title</title>
</head>
<body>
<ul>
<li id="1" class="c1">北京</li>
<li id="2">上海</li>
<li id="l1">广州</li>
<li id="l2">深圳</li>
</ul>
<ul>
<li>长沙</li>
<li>武汉</li>
<li>成都</li>
<li>重庆</li>
</ul>
</body>
</html>
代码如下(示例):解析_xpath的基本使用.py
# _*_ coding : utf-8 _*_
# @Time : 2023/2/17 10:49 AM
# @Author : yanhh
# @File : 解析_xpath的基本使用
# @Project : pythonProject
from lxml import etree
# XPath parsing comes in two flavours:
# 1) local file            -> etree.parse()
# 2) server response body  -> etree.HTML(response.read().decode("utf8"))  (the common case)
# Parse the local sample file; tree.xpath() returns a list of Elements, e.g.
# [<Element li at 0x7faa32dd8388>, <Element li at 0x7faa32dd8488>, ...]
tree = etree.parse('xpath基本使用.html')
# print(tree)  -> <lxml.etree._ElementTree object at 0x7fda863cb6c8>
# Basic syntax:
# 1. Path lookup -> [<Element li at 0x7faa32dd8388>, <Element li at 0x7faa32dd8488>, ...]
# li_list = tree.xpath('//body//li')
# 2. Predicate lookup: select the <li> tags that carry an id attribute
# 2.1 text() extracts the tag's text content -> ['北京', '上海', '广州', '深圳']
# li_list = tree.xpath('//ul/li[@id]/text()')
# 2.2 <li> tags whose id equals "1" (mind the nested quoting) -> ['北京']
# li_list = tree.xpath('//ul/li[@id="1"]/text()')
# 2.3 the class attribute value of the <li> whose id is "1" -> ['c1']
# li_list = tree.xpath('//ul/li[@id="1"]/@class')
# 3. Fuzzy matching
# id contains "l" -> ['广州', '深圳']
# li_list = tree.xpath('//ul/li[contains(@id,"l")]/text()')
# id starts with "l" (starts-with) -> ['广州', '深圳']
# li_list = tree.xpath('//ul/li[starts-with(@id,"l")]/text()')
# Logical operators
# 1. both predicates must hold -> ['北京']
li_list = tree.xpath('//ul/li[@id="1" and @class="c1"]/text()')
print(li_list)
# Number of matches
print(len(li_list))
2、实战
1、代码如下(示例):xpath解析百度一下文字
# _*_ coding : utf-8 _*_
# @Time : 2023/2/17 3:11 PM
# @Author : yanhh
# @File : 解析_获取百度网站的百度一下文字
# @Project : pythonProject
#
# Fetch Baidu's home page and print the label of the search button via XPath.
import urllib.request
import ssl
from lxml import etree

# Demo-only: disable TLS certificate verification globally so urlopen works
# even without an up-to-date CA bundle. Do not do this in production code.
ssl._create_default_https_context = ssl._create_unverified_context

url = 'http://www.baidu.com'
# Browser-like User-Agent so the server returns the normal desktop page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
# Use a context manager so the HTTP response is closed deterministically
# (the original leaked it until garbage collection).
with urllib.request.urlopen(request) as response:
    content = response.read().decode('utf8')
# Server responses are parsed with etree.HTML() (etree.parse() is for local files).
tree = etree.HTML(content)
# value attribute of <input id="su">. Renamed from `list`, which shadowed the builtin.
button_text = tree.xpath('//input[@id="su"]/@value')[0]
# prints: 百度一下
print(button_text)
2、站长素材
1、代码如下(示例):下载图片到指定文件夹下
# _*_ coding : utf-8 _*_
# @Time : 2023/2/17 4:17 PM
# @Author : yanhh
# @File : 解析_站长素材
# @Project : pythonProject
# 'https://sc.chinaz.com/tupian/huacaotupian_2.html'
# 'https://sc.chinaz.com/tupian/huacaotupian.html'
import urllib.request
from lxml import etree
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
def create_request(page):
    """Build the urllib Request for one listing page of the flower-photo
    category on sc.chinaz.com.

    Page 1 uses the bare category URL; every later page carries a
    numeric "_<n>" suffix (e.g. huacaotupian_2.html).
    """
    if page == 1:
        url = 'https://sc.chinaz.com/tupian/huacaotupian.html'
    else:
        url = 'https://sc.chinaz.com/tupian/huacaotupian_' + str(page) + '.html'
    # Browser-like User-Agent so the site serves the regular HTML page.
    ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
    return urllib.request.Request(url=url, headers={'User-Agent': ua})
def get_content(request):
    """Open *request* and return the response body decoded as UTF-8.

    Uses a context manager so the HTTP response is closed deterministically
    (the original left it to the garbage collector).
    """
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf8')
def down_load(content):
    """Parse one listing page's HTML and download every image into huaImg/.

    The file name comes from the image's alt text; the image URL comes from
    the data-original attribute because the site lazy-loads images (src is
    only a placeholder until the image scrolls into view).
    """
    import os
    tree = etree.HTML(content)
    # Image titles -> used as the downloaded file names.
    name_list = tree.xpath('//div[@class="container"]//img/@alt')
    # Lazy loading: the real URL lives in data-original, not src.
    src_list = tree.xpath('//div[@class="container"]//img/@data-original')
    # Create the target folder if needed (original crashed when huaImg/ was missing).
    os.makedirs('huaImg', exist_ok=True)
    # zip() pairs names with URLs and stops at the shorter list, avoiding an
    # IndexError if the two queries ever return different counts.
    for name, src in zip(name_list, src_list):
        # src is protocol-relative (starts with //), so prepend the scheme.
        url = 'https:' + src
        urllib.request.urlretrieve(url=url, filename='huaImg/' + name + '.jpg')
if __name__ == '__main__':
    # Ask for an inclusive page range, then crawl it one page at a time.
    start_page = int(input("请输入开始页码"))
    end_page = int(input("请输入结束页码"))
    for page in range(start_page, end_page + 1):
        down_load(get_content(create_request(page)))