The spider code is given below. Running it requires a local MongoDB server, plus Google Chrome and the chromedriver build that matches the installed Selenium and Chrome versions.
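One note on driver setup: `executable_path` is the Selenium 3 API; Selenium 4 removed that argument in favor of a `Service` object. A minimal equivalent sketch for Selenium 4, assuming the same driver path:

```python
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Selenium 4 style: wrap the chromedriver path in a Service object
service = Service("D:/chromedriver_win32/chromedriver.exe")
driver = webdriver.Chrome(service=service)
```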
```python
from selenium import webdriver
from lxml import etree
import re
import json
import pymongo


class Lol_spider(object):
    def __init__(self):
        # Path to a chromedriver matching the local Chrome version
        self.driver = webdriver.Chrome(executable_path="D:/chromedriver_win32/chromedriver.exe")

    def get_html_text(self, url):
        self.driver.get(url)
        return self.driver.page_source

    def get_detail_page_url(self, html_text):
        # Collect the detail-page URL of every hero on the list page
        html = etree.HTML(html_text)
        li_list = html.xpath("//ul[@class='imgtextlist']/li")
        detail_urls = []
        for li in li_list:
            url = li.xpath("./a/@href")[0]
            detail_urls.append("https://lol.qq.com/data/" + url)
        return detail_urls

    def parse_detail_page(self, text, hero_id):
        html = etree.HTML(text)
        item = {}
        item["name"] = html.xpath("//a[@class='here']/text()")[0]
        item["employment"] = html.xpath("//div[@class='defail-tags']/span/text()")[0]
        # The four ability bars encode their value as a width percentage in the
        # style attribute (e.g. style="width:80%"), so \d+ pulls out the number
        item["physical_attack"] = re.search(r"\d+", html.xpath("//i[@class='up up1']/@style")[0]).group()
        item["magic_attack"] = re.search(r"\d+", html.xpath("//i[@class='up up2']/@style")[0]).group()
        item["defense_ability"] = re.search(r"\d+", html.xpath("//i[@class='up up3']/@style")[0]).group()
        item["operation_difficulty"] = re.search(r"\d+", html.xpath("//i[@class='up up4']/@style")[0]).group()
        item["skills"], item["skins"] = self.get_skillAndSkin(hero_id)
        return item

    def get_skillAndSkin(self, hero_id):
        # Skills and skins live in a per-hero JS file rather than on the detail page
        url = "https://game.gtimg.cn/images/lol/act/img/js/hero/" + hero_id + ".js"
        browser = webdriver.Chrome(executable_path="D:/chromedriver_win32/chromedriver.exe")
        browser.get(url)
        data = browser.page_source
        browser.quit()  # close the temporary browser instead of leaking one per hero
        data = re.search(r'{"hero.+"}', data, re.S).group()
        py_data = json.loads(data)  # json.loads takes no encoding argument in Python 3
        skill_data = py_data["spells"]
        skin_data = py_data["skins"]
        skills = []
        for s_d in skill_data:
            sk_item = {}
            sk_item["spell_Key"] = s_d["spellKey"]
            sk_item["name"] = s_d["name"]
            sk_item["description"] = s_d["description"]
            skills.append(sk_item)
        skins = []
        for skin_d in skin_data:
            # (name, image URL) pairs; pymongo stores tuples as BSON arrays
            skin_item = (skin_d["name"], skin_d["mainImg"])
            skins.append(skin_item)
        return skills, skins

    def save_data(self, item):
        mongo_client = pymongo.MongoClient()
        mongodb = mongo_client["LOL"]
        mongo_col = mongodb["legend_info"]
        mongo_col.insert_one(item)
        mongo_client.close()
        print(item["name"] + " obtained successfully")

    def run(self):
        list_page_url = "https://lol.qq.com/data/info-heros.shtml"
        list_page_html = self.get_html_text(list_page_url)
        detail_urls = self.get_detail_page_url(list_page_html)
        for d_u in detail_urls:
            # The hero id is the numeric part of the detail URL
            hero_id = re.search(r"\d+", d_u).group()
            # Open each detail page in a new tab, parse it, then close the tab
            js = 'window.open("' + d_u + '")'
            self.driver.execute_script(js)
            self.driver.switch_to.window(self.driver.window_handles[1])
            item = self.parse_detail_page(self.driver.page_source, hero_id)
            self.save_data(item)
            self.driver.close()
            self.driver.switch_to.window(self.driver.window_handles[0])


if __name__ == '__main__':
    ls = Lol_spider()
    ls.run()
```
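Launching a second Chrome instance for every hero just to read `get_skillAndSkin`'s JS file is heavy. The regex in that method suggests the file body is a bare JSON document, so it can presumably be fetched with `requests` directly. A minimal sketch under that assumption, reusing the endpoint and keys from above:

```python
import json
import requests

def get_skill_and_skin(hero_id):
    # Same per-hero data file that get_skillAndSkin loads through Selenium
    url = "https://game.gtimg.cn/images/lol/act/img/js/hero/" + hero_id + ".js"
    resp = requests.get(url)
    resp.encoding = "utf-8"
    py_data = json.loads(resp.text)  # assumes the body is plain JSON
    skills = [{"spell_Key": s["spellKey"], "name": s["name"],
               "description": s["description"]} for s in py_data["spells"]]
    skins = [(s["name"], s["mainImg"]) for s in py_data["skins"]]
    return skills, skins
```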
The data saved to MongoDB looks like this:
![在这里插入图片描述](https://i-blog.csdnimg.cn/blog_migrate/7906fce146f0885becb41fd61aeece86.png)
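To spot-check what was written, you can query the `LOL`/`legend_info` collection used above directly; a minimal sketch with pymongo:

```python
import pymongo

client = pymongo.MongoClient()
# Print the name and role of the first few stored heroes
for doc in client["LOL"]["legend_info"].find({}, {"_id": 0, "name": 1, "employment": 1}).limit(5):
    print(doc)
client.close()
```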
The following code uses multiple threads to save every hero's skins locally:
```python
import pymongo
import queue
import re
import threading
import os
import requests


def download_img():
    # Worker: pull (name, url) pairs off the queue until it is empty.
    # The original `while url_name_q.not_empty:` tested a Condition object,
    # which is always truthy, so workers would block forever once the queue
    # drained; get_nowait() plus queue.Empty lets them exit cleanly.
    while True:
        try:
            u_n = url_name_q.get_nowait()
        except queue.Empty:
            break
        file_name = "./data/" + re.sub(r"/", "", u_n[0]) + ".jpg"
        file_url = u_n[1]
        if file_url != "":
            with open(file_name, "wb") as fp:
                fp.write(requests.get(file_url, headers=headers).content)
            print(u_n[0] + " saved locally successfully")
        url_name_q.task_done()


# Read every hero's skin list back out of MongoDB
client = pymongo.MongoClient()
data = list(client["LOL"]["legend_info"].find({}, {"_id": 0, "skins": 1}))
url_name_list = []
for d in data:
    url_name_list.append(d["skins"])

# Flatten the per-hero skin lists into one queue of (name, url) pairs
url_name_q = queue.Queue(3000)
for i in url_name_list:
    for j in i:
        url_name_q.put(j)

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}

os.makedirs("./data", exist_ok=True)  # target directory must exist before workers write to it

for i in range(10):
    t = threading.Thread(target=download_img)
    t.start()
```
The download results are as follows:
![在这里插入图片描述](https://i-blog.csdnimg.cn/blog_migrate/53bc4799f635875598e9f32949eb42b2.png)
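One caveat on the downloader: the main thread starts its workers and returns immediately, so nothing reports overall completion. To block until the queue is drained, keep the thread handles and join them; a minimal sketch reusing `download_img` from above:

```python
threads = []
for i in range(10):
    t = threading.Thread(target=download_img)
    t.start()
    threads.append(t)

for t in threads:
    t.join()  # wait until every worker has exited
print("all skins downloaded")
```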