今天听了老师的课,跟着做了一个爬取《王者荣耀》皮肤图片的爬虫,代码如下:
from bs4 import BeautifulSoup
import urllib.request
import requests
import json
import os
# Honor of Kings skin scraper: fetch the hero list JSON, then download each
# hero's skin images into a local folder.

# Request the hero list. JSON ({key: value, ...}) is a lightweight data format.
# The `with` block closes the HTTP response as soon as it has been read.
with urllib.request.urlopen("http://pvp.qq.com/web201605/js/herolist.json") as v_herolist_url:
    # The payload may start with a UTF-8 byte-order mark (\xef\xbb\xbf) sent by
    # the server. 'utf-8-sig' drops it when present and changes nothing
    # otherwise, unlike unconditionally slicing off the first three bytes.
    v_herolist = v_herolist_url.read().decode('utf-8-sig')

# Parse the JSON text into a list of hero records.
hero_json = json.loads(v_herolist)

# Local output folder. Backslashes are doubled so the literal contains no
# invalid "\m" escape; the resulting path is unchanged.
hero_dir = 'G:\\myhero1\\'
# Create the folder once up front instead of re-checking for every hero.
os.makedirs(hero_dir, exist_ok=True)

cnt = 0  # number of images downloaded successfully
for hero in hero_json:
    hero_id = hero['ename']    # hero id
    hero_name = hero['cname']  # hero name
    skin_name = hero['skin_name'].split('|')  # skin names
    for j, skin in enumerate(skin_name):
        # Name the file after the skin at position j so it matches the image
        # requested below (the image URLs are numbered from 1, hence j + 1).
        hero_img = hero_dir + (hero_name + "-" + skin + ".jpg")
        url_aim = (
            "http://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/"
            + str(hero_id) + "/" + str(hero_id) + "-bigskin-" + str(j + 1) + ".jpg"
        )
        try:
            urllib.request.urlretrieve(url_aim, hero_img)
        except OSError as err:
            # URLError/HTTPError are OSError subclasses: report the failed
            # image and keep going rather than aborting the whole run.
            print("下载失败:" + url_aim + " (" + str(err) + ")")
            continue
        cnt += 1
        print("正在写入:" + hero_name + "-" + skin)
print("爬取完成! 共爬取:" + str(cnt) + "次")
效果图: