python爬虫实例-王者荣耀皮肤(同时包含html、json数据)
概述
突发奇想,尝试爬取王者荣耀各英雄的皮肤图片。想要达到的效果有2点:一是为每个英雄单独建立一个文件夹;二是在每个文件夹下保存该英雄的所有皮肤图片,并以“英雄名-皮肤名”的格式命名。
下面就来跟随我的思路分析、撸代码叭!
完整代码、效果展示
import requests,re,os,time
from bs4 import BeautifulSoup
# Request headers sent with every HTTP call in this script: a desktop
# browser User-Agent string (QQBrowser on Windows).
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 '
        'Core/1.70.3775.400 QQBrowser/10.6.4208.400'
    ),
}
# 爬取英雄编号
def spider_num():
    """Fetch the hero list and map each hero name to its hero number.

    Downloads the hero-list JSON and reads the ``cname`` (hero name) and
    ``ename`` (hero number) fields of every entry.

    Returns:
        dict: ``{hero name: hero number, ...}``.

    Raises:
        requests.exceptions.RequestException: on a network failure, a
            timeout, or a non-success HTTP status.
    """
    # URL of the JSON document describing all heroes.
    url_all = 'https://pvp.qq.com/web201605/js/herolist.json'
    # A timeout keeps the crawl from hanging forever on a stalled connection.
    res_all = requests.get(url_all, headers=headers, timeout=10)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    res_all.raise_for_status()
    return {hero['cname']: hero['ename'] for hero in res_all.json()}
# 爬取英雄皮肤名称
def spider_name(num_dic):
    """Fetch the skin names of every hero in ``num_dic``.

    For each hero, the detail page is downloaded and the ``data-imgname``
    attribute of the skin-list ``<ul>`` is read; every run of Chinese
    characters in that attribute is taken as one skin name.

    Args:
        num_dic: mapping of hero name to hero number, as returned by
            ``spider_num``.

    Returns:
        dict: ``{hero name: [skin name, ...], ...}``. A hero whose page
        has no skin-list element (or no ``data-imgname`` attribute) maps
        to an empty list.

    Raises:
        requests.exceptions.RequestException: on a network failure, a
            timeout, or a non-success HTTP status.
    """
    # Compiled once outside the loop; matches runs of CJK ideographs.
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    skin_dic = {}
    for name, num in num_dic.items():
        # Detail page of a single hero.
        url_skin = 'https://pvp.qq.com/web201605/herodetail/' + str(num) + '.shtml'
        # A timeout keeps one stalled page from blocking the whole crawl.
        res_skin = requests.get(url_skin, headers=headers, timeout=10)
        res_skin.raise_for_status()
        # The response is decoded as GBK before parsing.
        res_skin.encoding = 'gbk'
        bs_skin = BeautifulSoup(res_skin.text, 'html.parser')
        skin = bs_skin.find('ul', class_='pic-pf-list pic-pf-list3')
        # find() returns None when the element is absent; treat that (and a
        # missing attribute) as "no skins" instead of raising TypeError/KeyError.
        skin_str = skin.get('data-imgname', '') if skin is not None else ''
        skin_dic[name] = pattern.findall(skin_str)
    return skin_dic
# 爬取皮