文章目录
1.xpath常见筛选方式
测试网页源代码
<!-- Sample registration page that the XPath examples below parse
     (saved locally as 模拟登录.html). Do not edit: the queries and the
     expected results in the tutorial depend on this exact markup. -->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>模拟登录</title>
</head>
<body background="e6fa7ebdb49b3f57569742132926fc4d.jpg" style="background-size: cover">
<h1 style="text-align: center">
欢迎注册
</h1>
<h1 style="text-align: center">
好好学习,天天向 上
</h1>
<form action="" method="post" style="text-align:center">
<input name="user" placeholder="昵称" type="text"><br><br>
<input name="password" placeholder="密码" type="text"><br><br>
<input name="XXX" value="1" type="radio">我同意<a href="https://ti.qq.com/agreement/index.html">服务协议</a>与
<a href="https://rule.tencent.com/rule/preview/3fd52bde-6555-453b-9ab8-c5f1f3d22c62">隐私保护协议</a><br><br>
<input type="submit" value="立即注册">
</form>
</body>
</html>
- //从文档任意位置选取匹配的节点(不限层级),多个节点以列表的形式返回
from lxml import etree

# "with" guarantees the file handle is closed; the original opened the file
# and never called close().
with open("模拟登录.html", "r", encoding='utf-8') as file:
    html_data = file.read()

# Build an element tree from the raw HTML text.
html = etree.HTML(html_data)
print(html.xpath('//form'))  # '//' matches <form> nodes anywhere in the document
print(html.xpath('//h1'))    # all <h1> nodes, returned as a list
- /从根节点出发获取节点
from lxml import etree

# Context manager closes the file automatically (original leaked the handle).
with open("模拟登录.html", "r", encoding='utf-8') as file:
    html_data = file.read()

html = etree.HTML(html_data)
# A leading '/' is an absolute path starting at the document root.
print(html.xpath('/html'))  # the <html> root node
- .表示从当前节点出发选取节点
from lxml import etree

# Context manager closes the file automatically (original leaked the handle).
with open("模拟登录.html", "r", encoding='utf-8') as file:
    html_data = file.read()

html = etree.HTML(html_data)
# './h1' is evaluated relative to the context node: here, all <h1>
# children of the first <body> element.
print(html.xpath('//body')[0].xpath('./h1'))
- ..返回当前节点的父节点
from lxml import etree

# Context manager closes the file automatically (original leaked the handle).
with open("模拟登录.html", "r", encoding='utf-8') as file:
    html_data = file.read()

html = etree.HTML(html_data)
# '..' steps up one level: the parent of the first <h1> is <body>.
print(html.xpath('//h1')[0].xpath('..'))
- @选取属性,返回属性值的列表
from lxml import etree

# Context manager closes the file automatically (original leaked the handle).
with open("模拟登录.html", "r", encoding='utf-8') as file:
    html_data = file.read()

html = etree.HTML(html_data)
# '@style' selects the attribute itself: returns the VALUE of every style
# attribute in the document as a list of strings.
# (The original comment wrongly said "class"; the query selects @style.)
print(html.xpath('//@style'))
- 路径表达式
from lxml import etree

# Context manager closes the file automatically (original leaked the handle).
with open("模拟登录.html", "r", encoding='utf-8') as file:
    html_data = file.read()

html = etree.HTML(html_data)
# Absolute path: walk root -> body -> h1 step by step.
print(html.xpath('/html/body/h1'))
print(html.xpath('/html/head/title'))  # the <title> node
# '//' shortcut reaches <body> at any depth, then its <h1> children.
print(html.xpath('//body/h1'))
谓语
谓语:用于从节点列表中筛选出满足条件的节点。
注意:谓语的列表下标从1开始,如果你要获取第一个元素下标写1不是0
from lxml import etree

# Context manager closes the file automatically (original leaked the handle).
with open("模拟登录.html", "r", encoding='utf-8') as file:
    html_data = file.read()

html = etree.HTML(html_data)

# Predicates ([...]) filter a node list. XPath positions start at 1, not 0.
print(html.xpath('//body/h1[1]'))            # first <h1> under <body>
print(html.xpath('//body/h1[last()]'))       # last <h1>; last()-1 is second-to-last
print(html.xpath('//body/h1[position()<3]')) # first two <h1> elements
# <body> nodes whose style attribute equals exactly "background-size: cover"
# (the original comment had a typo: "tyle=").
print(html.xpath('//body[@style="background-size: cover"]'))
# text() extracts the text content of the matched nodes.
print(html.xpath('//body[@style="background-size: cover"][1]/h1/text()'))
- |(并集运算符)同时返回两种标签的节点
# '|' is the XPath union operator: the result contains nodes matched by
# either expression (here every <body> and every <h1>).
print(html.xpath('//body|//h1'))
2.爬取信息实战
import csv

import requests
from lxml import etree

# Browser-like User-Agent so the site does not reject the request outright.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 '
                  'Safari/537.36 '
}

# Scrape 4 listing pages from lianjia and save title/area/location/price
# columns to data.csv.
# - "with" guarantees the file is closed even if a request raises
#   (the original relied on an unguarded file.close() at the end).
# - csv.writer quotes fields, so a comma inside a title no longer corrupts
#   a row (the original wrote raw comma-joined f-strings).
# - encoding='gbk' kept for Excel on Chinese Windows; errors='replace'
#   prevents a UnicodeEncodeError on characters outside the GBK range.
with open('data.csv', 'w', encoding='gbk', errors='replace', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['标题', '区域', '位置', '价格', '一平米多少元'])
    for page in range(1, 5):
        print(f'正在爬取第{page}页数据')
        url = f'https://ty.lianjia.com/ershoufang/pg{page}/'
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            print('网页请求失败')
            continue  # skip this page, keep crawling the rest
        html = etree.HTML(response.text)
        # One list per column; each li under #content is a listing card.
        title = html.xpath('//*[@id="content"]/div[1]/ul/li/div[1]/div[1]/a/text()')
        pos_first = html.xpath('//*[@id="content"]/div[1]/ul/li/div[1]/div[2]/div/a[1]/text()')
        pos_sec = html.xpath('//*[@id="content"]/div[1]/ul/li/div[1]/div[2]/div/a[2]/text()')
        price = html.xpath('//*[@id="content"]/div[1]/ul/li/div[1]/div[6]/div[1]/span/text()')
        per_price = html.xpath('//*[@id="content"]/div[1]/ul/li/div[1]/div[6]/div[2]/span/text()')
        print('正在写入中')
        # NOTE(review): zip truncates to the shortest list, so a card missing
        # one field silently drops trailing rows — verify the five XPath
        # result lists stay aligned on real pages.
        rows = zip(title, pos_first, pos_sec, price, per_price)
        for _title, _pos_first, _pos_sec, _price, _per_price in rows:
            writer.writerow([_title, _pos_first, _pos_sec, f'{_price}万', _per_price])
print('4页数据爬取完毕')