一、代码详解
# Standard library
import os    # create the local image/ output directory before writing files
import re    # regex extraction of the portrait URL from the style attribute
import time  # for time.sleep() request throttling (currently commented out)

# Third-party
import parsel    # CSS-selector based HTML extraction
import requests  # HTTP client for the three requests below
# 请求网址
# URL of the hero-list page to scrape.
url = 'https://pvp.qq.com/web201605/herolist.shtml'
# Request headers copied from a real browser session (cookie, referer,
# user-agent, ...) so the server treats the scraper like a normal visit.
headers = {
'cookie': 'tvfe_boss_uuid=5802db82d407f220; pgv_pvid=5873823518; _clck=3273596923|1|f60|0; isHostDate=19291; PTTuserFirstTime=1666742400000; isOsSysDate=19291; PTTosSysFirstTime=1666742400000; isOsDate=19291; PTTosFirstTime=1666742400000; pgv_info=ssid=s8005661472; ts_uid=1129461762; weekloop=0-0-0-44; ieg_ingame_userid=afgcpYA0tATf1gtpXmSC1xoI5EB88qsk; eas_sid=11j6Z6w6g7q9M2Z8s657e2W6v2; ts_last=pvp.qq.com/web201605/herodetail/548.shtml; pvpqqcomrouteLine=herolist_herolist_herodetail_herodetail; PTTDate=1666793104885',
'if-modified-since': 'Wed, 26 Oct 2022 14:00:00 GMT',
'referer': 'https://pvp.qq.com/web201605/herolist.shtml',
'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-origin',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
}
# First request: fetch the hero-list page.
response = requests.get(url=url, headers=headers)
# Decode with the encoding sniffed from the response body rather than the
# header-declared one (presumably the declared charset is missing or wrong
# for this page — verify; apparent_encoding guesses from the content).
response.encoding = response.apparent_encoding
# Parse the HTML text (not JSON) into a Selector for CSS-based extraction.
selector = parsel.Selector(response.text)
# Select every <li> entry inside the hero-list <ul>.
lis = selector.css('body > div.wrapper > div > div > div.herolist-box > div.herolist-content > ul > li')
# Iterate over the hero-list items; for each hero, fetch its detail page,
# extract the portrait image URL from an inline style attribute, and save
# the image as image/<hero name>.jpg.
os.makedirs('image', exist_ok=True)  # ensure the output directory exists
for li in lis:
    # Build the absolute detail-page URL from the relative href.
    href = 'https://pvp.qq.com/web201605/' + li.css('a::attr(href)').get()
    title = li.css('a::text').get()
    # Skip entries without a hero name (e.g. image-only anchors).
    if title:
        # Second request: fetch the hero's detail page.
        resp = requests.get(url=href, headers=headers)
        sel = parsel.Selector(resp.text)
        data_href = sel.css('body > div.wrapper > div.zk-con1.zk-con::attr(style)').get()
        # The style attribute embeds a protocol-relative background-image
        # URL. Guard against a missing attribute, then extract the first
        # ".jpg" path. Raw string, escaped dot and non-greedy ".*?" fix the
        # original pattern '//.*.jpg', whose unescaped dot and greedy match
        # could over-extend past the intended URL.
        if data_href is None:
            continue
        match = re.search(r'//.*?\.jpg', data_href)
        if match is None:
            continue  # no portrait found on this page
        image_url = 'https:' + match.group(0)
        # time.sleep(1)
        print(title, image_url)
        # Third request: download the raw image bytes.
        content = requests.get(url=image_url, headers=headers).content
        # Write the bytes to the local image/ directory, one file per hero.
        with open('image/' + title + '.jpg', mode='wb') as f:
            f.write(content)
二、效果展示