# 思路:页面源代码里请求出来的是假数据,皮肤 img 那块是统一格式的占位图,拿不到想要的 img。
# 换种思路:抓包,从 img 分类里拿到皮肤图片的 url;分析之后,只需改动英雄的 id、在皮肤 url 后面加上数字,即可拿到正确的 url。
# 需要用到的知识:requests 请求、os(创建文件夹)、正则、字符串分割、xpath 数据解析;可以使用多线程加快数据的保存。
# 中文乱码问题:查看 charset 字符集,用 encoding 改为相应的字符集即可。
import requests
from lxml import etree
import os
import asyncio
# Root directory for all downloaded skins.
# BUG FIX: the original checked for "./王者荣耀皮肤" but created "./王者荣耀";
# the per-hero folders below live under "王者荣耀/", so both the existence
# check and the creation must use that same name.
if not os.path.exists("./王者荣耀"):
    os.makedirs("./王者荣耀")

# Hero-list page.  The skin <img> tags in its HTML are uniform placeholders,
# but each <li><a href="herodetail/NNN.shtml"><img alt="name"> entry carries
# the hero id (in the href) and the hero name (in the img alt).
main_url = 'https://pvp.qq.com/web201605/herolist.shtml'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
}

response = requests.get(url=main_url, headers=headers)
# The page is served as gbk; set the encoding so Chinese names decode cleanly.
response.encoding = 'gbk'
# Main-page HTML text.
page_text = response.text

# Parse out one <a> node per hero from the hero list.
tree = etree.HTML(page_text)
hero_detail_url = tree.xpath('//ul[@class="herolist clearfix"]/li/a')

for hero in hero_detail_url:
    # href looks like "herodetail/123.shtml" -> "123.shtml" -> "123" (hero id)
    href = hero.xpath('./@href')[0]
    hero_id = href.split('/')[1]
    hero_id = hero_id.split('.')[0]  # hero id
    hero_name = hero.xpath('./img/@alt')[0]  # hero display name

    count = 1  # sequential file number for the skins actually saved
    for num in range(1, 11):  # probe up to 10 skin slots per hero
        # Skin CDN URL pattern: .../hero-info/<hero_id>/<hero_id>-bigskin-<n>.jpg
        hero_url = ('https://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/'
                    + str(hero_id) + '/' + str(hero_id) + '-bigskin-' + str(num) + '.jpg')
        skin_response = requests.get(url=hero_url, headers=headers)
        # BUG FIX: the original compared the raw response *bytes* against the
        # *string* "b'404 page not found\n'"; a bytes-vs-str comparison is
        # always unequal, so 404 bodies were written out as broken .jpg files.
        # Check the HTTP status code instead and skip missing skin slots.
        if skin_response.status_code != 200:
            continue
        pifu_data = skin_response.content

        # Per-hero folder holding that hero's skins.
        hero_file = "王者荣耀/" + hero_name
        # Destination path for this skin image.
        hero_address = hero_file + '/' + str(count) + '.jpg'
        count = count + 1
        # Create the hero's folder on first save.
        if not os.path.exists(hero_file):
            os.makedirs(hero_file)
        # Write the binary image data to disk.
        with open(hero_address, 'wb') as fp:
            fp.write(pifu_data)
            print(hero_address, "保存成功")