一、实验原理
使用通用爬虫爬取网页数据
实例化etree对象,且将页面数据加载到该对象中
使用xpath函数结合xpath表达式进行标签定位和指定数据提取
1.1 etree对象实例化
- 本地文件:
tree = etree.parse(文件名)
tree.xpath("xpath表达式")
- 网络数据:
tree = etree.HTML(网页内容字符串)
tree.xpath("xpath表达式")
1.2 使用规范
/
表示一个层级
//
表示多个层级
//
可以表示从任意位置开始定位
./
表示从当前节点开始(相对于当前标签进行局部定位)
xpath表达式获取捷径:F12打开开发者工具,找到HTML源代码,定位到标签,右击复制xpath表达式。
二、实战案例
2.1 爬取二手房文本标题
import requests
from lxml import etree
if __name__ == "__main__":
    # Scrape the listing titles from the 58.com Beijing second-hand housing
    # page and write them to 58.txt, one title per line.
    url = 'https://bj.58.com/ershoufang/'
    # Send a browser User-Agent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188'
    }
    # Fetch the HTML of the listing page.
    page_text = requests.get(url=url, headers=headers).text
    # Parse the page and select every listing-title container element.
    tree = etree.HTML(page_text)
    title_nodes = tree.xpath('//*[@id="esfMain"]/section/section[3]/section[1]/section[2]//div[@class="property-content-title"]')
    # The context manager closes the file even if parsing fails part-way.
    with open('58.txt', 'w', encoding='utf-8') as fp:
        for node in title_nodes:
            # Relative query: "./" starts from the current element.
            texts = node.xpath('./h3/text()')
            if not texts:
                # Container without <h3> text; skip it instead of raising
                # IndexError and losing the remaining titles.
                continue
            fp.write(texts[0] + '\n')
2.2 爬取图片
# 实战爬取图片
import requests
from lxml import etree
import os
if __name__ == "__main__":
    # Download every thumbnail listed on the pic.netbian.com "4Kyuanchuang"
    # index page into ./piclibs/.
    url = 'http://pic.netbian.com/4Kyuanchuang/'
    # Send a browser User-Agent with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188"
    }
    # Fetch the index page source.
    response = requests.get(url=url, headers=headers)
    # Force GBK decoding of the body.
    # NOTE(review): presumably the site serves GBK-encoded pages - confirm.
    response.encoding = 'gbk'
    response_text = response.text
    # Parse the page and select the <li> element of each picture.
    tree = etree.HTML(response_text)
    li_list = tree.xpath('//*[@id="main"]/div[3]/ul/li')
    print(li_list)
    # Create the output folder; exist_ok avoids the check-then-create race.
    os.makedirs('./piclibs/', exist_ok=True)
    for li in li_list:
        alt_values = li.xpath("./a/img/@alt")
        src_values = li.xpath("./a/img/@src")
        if not alt_values or not src_values:
            # <li> without an <a><img> carrying both attributes; skip it
            # rather than aborting the whole run with IndexError.
            continue
        # File name is the last 10 characters of the alt text plus ".jpg".
        title = alt_values[0][-10:] + '.jpg'
        # The src attribute is prefixed with the site origin to form the URL.
        img_url = "http://pic.netbian.com" + src_values[0]
        # Fetch the raw image bytes.
        response_img = requests.get(url=img_url, headers=headers).content
        img_name = "./piclibs/" + title
        with open(img_name, 'wb') as fp:
            fp.write(response_img)
        print(title + "保存成功!!!")
2.3 爬取全国城市的名称
# 爬取全国城市
import requests
from lxml import etree
if __name__ == "__main__":
    # Collect the city names listed on the aqistudy.cn history-data page and
    # save them to airdata.txt as the string form of a Python list.
    url = 'https://www.aqistudy.cn/historydata/'
    # Send a browser User-Agent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188'
    }
    # Fetch the HTML of the page.
    page_text = requests.get(url=url, headers=headers).text
    # Parse the page; "|" unions the two <li> sets into one result list.
    tree = etree.HTML(page_text)
    li_list = tree.xpath('/html/body/div[3]/div/div[1]/div[1]/div[2]/ul/li | /html/body/div[3]/div/div[1]/div[2]/div[2]/ul/div[2]/li')
    print(len(li_list))
    city_list = []
    for li in li_list:
        # Relative query: text of the <a> directly under this <li>.
        names = li.xpath('./a/text()')
        if not names:
            # <li> without link text; skip it instead of raising IndexError.
            continue
        city_name = names[0]
        print(city_name)
        city_list.append(city_name)
    # Write once after collection. write() emits the same bytes as the
    # original writelines(str(...)) without going character by character,
    # and the context manager guarantees the file is closed.
    with open('airdata.txt', 'w', encoding='utf-8') as fp:
        fp.write(str(city_list))