一、代码详解
# Standard library
import os    # create the local image/ output directory before writing files
import re    # regex extraction of the portrait URL from the style attribute
import time  # for time.sleep() request throttling (currently commented out)

# Third-party
import parsel    # CSS-selector based HTML extraction
import requests  # HTTP client for the three requests below
# 请求网址
# URL of the hero-list page to scrape.
url = 'https://pvp.qq.com/web201605/herolist.shtml'
# Request headers copied from a real browser session (cookie, referer,
# user-agent, ...) so the server treats the scraper like a normal visit.
headers = {
'cookie': 'tvfe_boss_uuid=5802db82d407f220; pgv_pvid=5873823518; _clck=3273596923|1|f60|0; isHostDate=19291; PTTuserFirstTime=1666742400000; isOsSysDate=19291; PTTosSysFirstTime=1666742400000; isOsDate=19291; PTTosFirstTime=1666742400000; pgv_info=ssid=s8005661472; ts_uid=1129461762; weekloop=0-0-0-44; ieg_ingame_userid=afgcpYA0tATf1gtpXmSC1xoI5EB88qsk; eas_sid=11j6Z6w6g7q9M2Z8s657e2W6v2; ts_last=pvp.qq.com/web201605/herodetail/548.shtml; pvpqqcomrouteLine=herolist_herolist_herodetail_herodetail; PTTDate=1666793104885',
'if-modified-since': 'Wed, 26 Oct 2022 14:00:00 GMT',
'referer': 'https://pvp.qq.com/web201605/herolist.shtml',
'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-origin',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
}
# First request: fetch the hero-list page.
response = requests.get(url=url, headers=headers)
# Decode with the encoding sniffed from the response body rather than the
# header-declared one (presumably the declared charset is missing or wrong
# for this page — verify; apparent_encoding guesses from the content).
response.encoding = response.apparent_encoding
# Parse the HTML text (not JSON) into a Selector for CSS-based extraction.
selector = parsel.Selector(response.text)
# Select every <li> entry inside the hero-list <ul>.
lis = selector.css('body > div.wrapper > div > div > div.herolist-box > div.herolist-content > ul > li')
# Iterate over the hero-list items; for each hero, fetch its detail page,
# extract the portrait image URL from an inline style attribute, and save
# the image as image/<hero name>.jpg.
os.makedirs('image', exist_ok=True)  # ensure the output directory exists
for li in lis:
    # Build the absolute detail-page URL from the relative href.
    href = 'https://pvp.qq.com/web201605/' + li.css('a::attr(href)').get()
    title = li.css('a::text').get()
    # Skip entries without a hero name (e.g. image-only anchors).
    if title:
        # Second request: fetch the hero's detail page.
        resp = requests.get(url=href, headers=headers)
        sel = parsel.Selector(resp.text)
        data_href = sel.css('body > div.wrapper > div.zk-con1.zk-con::attr(style)').get()
        # The style attribute embeds a protocol-relative background-image
        # URL. Guard against a missing attribute, then extract the first
        # ".jpg" path. Raw string, escaped dot and non-greedy ".*?" fix the
        # original pattern '//.*.jpg', whose unescaped dot and greedy match
        # could over-extend past the intended URL.
        if data_href is None:
            continue
        match = re.search(r'//.*?\.jpg', data_href)
        if match is None:
            continue  # no portrait found on this page
        image_url = 'https:' + match.group(0)
        # time.sleep(1)
        print(title, image_url)
        # Third request: download the raw image bytes.
        content = requests.get(url=image_url, headers=headers).content
        # Write the bytes to the local image/ directory, one file per hero.
        with open('image/' + title + '.jpg', mode='wb') as f:
            f.write(content)
二、效果展示