一、爬取图片
import requests, os
from lxml import etree
def main():
    """Download the images listed on the pic.netbian.com 4K gallery page.

    Fetches the listing page, selects every ``<li>`` under
    ``div.slist > ul``, and saves the file referenced by each item's
    ``<img src>`` into ``./4ktupian`` as ``<alt text>.jpg``.

    Raises:
        requests.RequestException: if the listing page cannot be fetched
            or answers with an HTTP error status. Failures on individual
            images are reported on stdout and skipped.
    """
    # Imported locally so this block does not depend on the file-level
    # import lines being edited.
    from urllib.parse import urljoin

    base_url = "https://pic.netbian.com/"
    list_url = "https://pic.netbian.com/4kmeinv/"
    out_dir = "./4ktupian"
    # Seconds. Without a timeout requests waits forever on a stalled socket.
    timeout = 10

    # Present a browser User-Agent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36 Edg/90.0.818.39'
    }

    response = requests.get(url=list_url, headers=headers, timeout=timeout)
    response.raise_for_status()
    # The original repaired garbled Chinese names by re-encoding the text as
    # ISO-8859-1 and decoding it as GBK. Declaring the page encoding up front
    # gives the same names and does not break (UnicodeEncodeError) when
    # requests happens to pick a different fallback charset.
    response.encoding = 'gbk'

    tree = etree.HTML(response.text)
    li_list = tree.xpath("//div[@class='slist']/ul/li")

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(out_dir, exist_ok=True)

    for li in li_list:
        srcs = li.xpath("./a/img/@src")
        if not srcs:
            # List item without an image (e.g. an ad slot): nothing to save.
            continue
        src = srcs[0]
        # urljoin handles root-relative, relative and absolute src values;
        # plain concatenation produced "//" or a broken URL for some of them.
        img_url = urljoin(base_url, src)

        alts = li.xpath("./a/img/@alt")
        # Fall back to the source file's base name when there is no alt text.
        stem = alts[0] if alts else os.path.splitext(os.path.basename(src))[0]
        # A path separator inside the title would point open() at a
        # directory that does not exist.
        img_name = stem.replace("/", "_").replace("\\", "_") + '.jpg'

        try:
            img_response = requests.get(url=img_url, headers=headers, timeout=timeout)
            # Do not store an HTML error page under a .jpg name.
            img_response.raise_for_status()
        except requests.RequestException as exc:
            print(f"skipping {img_url}: {exc}")
            continue

        img_path = os.path.join(out_dir, img_name)
        with open(img_path, 'wb') as f:
            f.write(img_response.content)


if __name__ == '__main__':
    main()
二、全国城市名称爬取
1. 让两个 xpath 表达式都生效：在同一个表达式字符串内部用 | 连接（| 是 XPath 的并集运算符，不能写在两个 Python 字符串之间，否则会抛出 TypeError）
tree.xpath('xpath表达式一 | xpath表达式二')
.