import json
import os
import re
from json import JSONDecodeError
from typing import Optional
from urllib.parse import quote

import requests

from gamerobot.settings import STATIC_ROOT, logger
from gr_project.models import TapTapReserveInfo, TapTapHotSearchesInfo
from gr_project.new_game_subscription import save_icon
from utils.common_func import replace_long_string
class TapTapCrawler:
    """Crawler for the TapTap reservation ranking and hot-search list.

    Most methods work on the JSON payload embedded in TapTap HTML pages.
    As used throughout this class, that payload is a flat list in which
    dict values are integer indices pointing back into the same list, so
    reading a field means following one or more index hops.
    """

    # Seconds to wait for any single HTTP request before giving up, so a
    # stalled connection cannot block the crawl indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.url = "https://www.taptap.cn/top/reserve"
        self.headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'priority': 'u=0, i',
            'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'none',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
        }

    def request_interface(self, url: str) -> Optional[dict]:
        """
        Send a GET request to a JSON endpoint.

        :param url: URL to request
        :return: decoded JSON body, or None when the request fails, the
                 status code is not 200, or the body is not valid JSON
        """
        try:
            response = requests.request(
                "GET", url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
        except requests.RequestException as exc:
            logger.error(f"request_interface error:{exc}")
            logger.error(f"request_interface fail api:{url})")
            return None

        if response.status_code != 200:
            logger.error(f"request_interface fail:{response.content})")
            logger.error(f"request_interface fail api:{url})")
            return None

        try:
            return response.json()
        except ValueError:
            # A 200 response is not guaranteed to carry JSON (e.g. an HTML
            # anti-bot page); treat it like any other failed call.
            logger.error(f"request_interface invalid json api:{url}")
            return None

    def get_reserve(self, top: int = 10) -> list:
        """
        Fetch the reservation ranking through the web API.

        :param top: number of entries to fetch; should be a multiple of 10
        :return: list of dicts with game_id, desc, project_name,
                 reserve_count and game_icon
        """
        result = []
        # The endpoint is always called with limit=10, so paging is done by
        # advancing the "from" offset.
        for start in range(0, top, 10):
            url = f"https://www.taptap.cn/webapiv2/app-top/v2/hits?from={start}&limit=10&type_name=reserve&X-UA=V%3D1%26PN%3DWebApp%26LANG%3Dzh_CN%26VN_CODE%3D102%26LOC%3DCN%26PLT%3DPC%26DS%3DAndroid%26UID%3D96cefb73-0032-480c-b6b9-62bedd5883d8%26OS%3DWindows%26OSV%3D10%26DT%3DPC"
            response_data = self.request_interface(url)
            if response_data is None:
                continue
            for entry in response_data["data"]["list"]:
                app = entry["app"]
                result.append({
                    "game_id": app["id"],
                    "desc": app.get("rec_text"),
                    "project_name": app["title"],
                    "reserve_count": app["stat"]["reserve_count"],
                    "game_icon": app["icon"]["small_url"],
                })
        return result

    def requests_url(self, url: str) -> str:
        """
        Fetch a web page.

        :param url: URL to request
        :return: response body as text (returned regardless of status code)
        """
        response = requests.get(
            url=url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        return response.text

    @staticmethod
    def match_content(html_str: str) -> Optional[list]:
        """
        Extract the embedded JSON payload from a page.

        :param html_str: page content
        :return: the decoded payload, or None when no payload is found or
                 it is not valid JSON
        """
        # Shortest span that starts with "[[" and ends with "}]", both
        # delimiters included.
        pattern = r'\[\[.*?}\]'
        found = re.search(pattern, html_str)
        if found is None:
            return None
        try:
            return json.loads(found.group(0))
        except JSONDecodeError:
            return None

    def retrieval_hot_games(self, html_list: list) -> list:
        """
        Collect the hot-search games from a page payload.

        :param html_list: payload list returned by match_content
        :return: one dict per hot-search entry; the full game info when the
                 search succeeds, otherwise only {"project_name": ...}
        """
        game_list = []
        if not html_list or "游戏热搜" not in html_list:
            return game_list

        # The element right after the marker holds the indices of the
        # hot-search entries.
        marker_index = html_list.index("游戏热搜")
        try:
            hot_games = html_list[marker_index + 1]
        except IndexError:
            return game_list
        if not isinstance(hot_games, list):
            return game_list

        for entry_index in hot_games:
            try:
                item = html_list[entry_index]
                project_name = html_list[item["keyword"]]
            except (IndexError, KeyError, TypeError):
                # Malformed entry: skip it instead of aborting the whole list.
                continue

            # Look up icon / reservation count by searching for the name.
            app_info = self.search_app(project_name=project_name)
            if app_info is None:
                game_list.append({"project_name": project_name})
            else:
                game_list.append(app_info)
        return game_list

    def get_hot_searches(self):
        """
        Fetch the hot-search games from the reservation ranking page.

        :return: list of game dicts; empty when the page payload cannot be
                 extracted
        """
        html_content = self.requests_url(url=self.url)
        html_list = self.match_content(html_str=html_content)
        if html_list is None:
            return []
        return self.retrieval_hot_games(html_list=html_list)

    @staticmethod
    def remove_html_tags(text):
        """Return text with every ``<...>`` tag removed."""
        return re.sub(r'<.*?>', '', text)

    def get_app_reserve_info(self, html_list: list):
        """
        Read the reservation info of one game from its detail-page payload.

        :param html_list: payload list returned by match_content
        :return: dict with game_id, desc, project_name, reserve_count and
                 game_icon, or None when an index is out of range
        :raises KeyError, TypeError, AttributeError: when the payload does
                 not have the expected structure (callers may retry)
        """
        try:
            # The 4th value of the dict at position 2 is the index of the
            # app record.
            index = list(html_list[2].values())[3]
            index_dict = html_list[index]
            # reservation count
            stat_index = index_dict["stat"]
            reserve_count_index = html_list[stat_index]["reserve_count"]
            reserve_count = html_list[reserve_count_index]
            # description
            description_index = index_dict["description"]
            text_index = html_list[description_index]["text"]
            description = html_list[text_index]
            # icon
            icon_index = index_dict["icon"]
            small_url_index = html_list[icon_index]["small_url"]
            icon = html_list[small_url_index]
            # title
            title = html_list[index_dict["title"]]
            # id
            game_id = html_list[index_dict["id"]]
        except IndexError:
            return None

        desc = self.remove_html_tags(description)
        desc = replace_long_string(desc, length=15)
        return {
            "game_id": game_id,
            "desc": desc,
            "project_name": title,
            "reserve_count": reserve_count,
            "game_icon": icon,
        }

    def _fetch_app_reserve_info(self, app_id: str, attempts: int = 2):
        """Fetch and parse one game's detail page, retrying on parse failure.

        Returns the info dict, or None when the page has no payload or
        parsing fails on every attempt.
        """
        app_url = "https://www.taptap.cn/app/{}".format(app_id)
        for _ in range(attempts):
            html_content = self.requests_url(url=app_url)
            html_list = self.match_content(html_str=html_content)
            if html_list is None:
                return None
            try:
                return self.get_app_reserve_info(html_list=html_list)
            except Exception as exc:
                # Parsing fails intermittently, so the page is fetched again;
                # the crawl is best-effort and must not abort on one game.
                logger.warning(f"get_app_reserve_info fail app:{app_id} error:{exc!r}")
        return None

    def get_reserve_by_html(self):
        """
        Fetch the reservation ranking by scraping the ranking pages
        (pages 1-3) and then each listed game's detail page.

        :return: list of game info dicts; games that cannot be parsed are
                 skipped
        """
        result = []
        for page in [1, 2, 3]:
            url = "https://www.taptap.cn/top/reserve?page={}".format(page)
            html_content = self.requests_url(url=url)
            page_list = self.match_content(html_str=html_content)
            if page_list is None:
                continue
            app_ids = [
                element.replace("app:", "")
                for element in page_list
                if isinstance(element, str) and "app:" in element
            ]
            for app_id in app_ids:
                item = self._fetch_app_reserve_info(app_id)
                if item is not None:
                    result.append(item)
        return result

    def search_app(self, project_name: str) -> Optional[dict]:
        """
        Search a game by name through the website search page.

        :param project_name: game name
        :return: info dict of the first search result, or None when the
                 page payload is missing or lacks the expected structure
        """
        # Escape the name so characters such as "/", "?" or "#" cannot
        # change the URL structure.
        url = "https://www.taptap.cn/search/{}".format(
            quote(str(project_name), safe="")
        )
        html_content = self.requests_url(url=url)
        html_list = self.match_content(html_str=html_content)
        if html_list is None:
            return None

        try:
            # The element after "mix" lists the result indices,
            # e.g. [276, 546, 636, ...]; the first one is the closest match.
            mix_list = html_list[html_list.index("mix") + 1]
            first = html_list[mix_list[0]]    # {'type': .., 'identification': .., 'brand': ..}
            brand = html_list[first["brand"]]  # {'app': .., 'contents': .., 'stat': ..}
            app = html_list[brand["app"]]      # {'id': .., 'title': .., 'icon': .., 'rec_text': ..}
            stat = html_list[brand["stat"]]    # {'hits_total': .., 'reserve_count': ..}
            icon = html_list[app["icon"]]      # {'url': .., 'small_url': .., ...}
            return {
                "game_id": html_list[app["id"]],
                "desc": html_list[app["rec_text"]] if "rec_text" in app else None,
                "project_name": html_list[app["title"]],
                "reserve_count": html_list[stat["reserve_count"]] if "reserve_count" in stat else None,
                "game_icon": html_list[icon["small_url"]],
            }
        except (ValueError, LookupError, TypeError):
            # No "mix" marker, empty result list, or unexpected structure.
            logger.warning(f"search_app parse fail:{project_name}")
            return None

    @staticmethod
    def _store_icon(item: dict, icon_dir: str) -> None:
        """Save the item's icon under icon_dir and, when that succeeds,
        point item["game_icon"] at the local static path."""
        icon_path = os.path.join(icon_dir, "__INLINE__{}.png".format(item["game_id"]))
        if save_icon(item['game_icon'], icon_path):
            item["game_icon"] = '/static/game/__INLINE__{}.png'.format(item["game_id"])

    def save_database(self, date: str, update_time: str):
        """
        Crawl the reservation ranking and hot searches and store them.

        :param date: date stored on every record
        :param update_time: update time stored on every record
        """
        # Directory that receives the downloaded icons.
        path = os.path.join(STATIC_ROOT, 'game')
        os.makedirs(path, exist_ok=True)

        # get_reserve(top=30) is not used here: the original author noted
        # that the API call fails when run from the server.
        reserve_list = self.get_reserve_by_html()
        hot_searches = self.get_hot_searches()

        models = []
        for item in reserve_list:
            self._store_icon(item, path)
            item["date"] = date
            item["update_time"] = update_time
            models.append(TapTapReserveInfo(**item))

        models_hot = []
        for item in hot_searches:
            # Entries whose search failed only carry "project_name".
            if "game_id" in item and "game_icon" in item:
                self._store_icon(item, path)
            item["date"] = date
            item["update_time"] = update_time
            models_hot.append(TapTapHotSearchesInfo(**item))

        TapTapReserveInfo.objects.bulk_create(models)
        TapTapHotSearchesInfo.objects.bulk_create(models_hot)
# if __name__ == "__main__":
# crawler = TapTapCrawler()
# reserve_list = crawler.get_reserve(top=30)
# print(reserve_list)
# hot_searches = crawler.get_hot_searches()
# print(hot_searches)
记一次爬取TapTap预约榜、热搜榜实例
最新推荐文章于 2024-08-21 23:42:00 发布