A hands-on example: crawling TapTap's reservation ranking and hot-search ranking

import json
import os
import re
from json import JSONDecodeError
from typing import Optional

import requests

from gamerobot.settings import STATIC_ROOT, logger
from gr_project.models import TapTapReserveInfo, TapTapHotSearchesInfo
from gr_project.new_game_subscription import save_icon
from utils.common_func import replace_long_string


class TapTapCrawler:
    """TapTap爬虫类"""

    def __init__(self):
        self.url = "https://www.taptap.cn/top/reserve"
        self.headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'priority': 'u=0, i',
            'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'none',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
        }

    def request_interface(self, url: str) -> Optional[dict]:
        """
        Call a TapTap web API endpoint.
        :param url: request URL
        :return: parsed JSON response, or None on failure
        """
        response_data = None
        response = requests.get(url, headers=self.headers, timeout=10)
        if response.status_code == 200:
            response_data = response.json()
        else:
            logger.error(f"request_interface fail: {response.content}")
            logger.error(f"request_interface fail api: {url}")
        return response_data

    def get_reserve(self, top: int = 10) -> list:
        """
        Fetch the TapTap reservation ranking via the web API.
        :param top: number of entries to fetch; should be a multiple of 10
        :return: list of reservation entries
        """
        result = []
        for start in range(0, top, 10):
            # The API caps `limit` at 10 per request, so pagination is driven by the `from` parameter.
            url = f"https://www.taptap.cn/webapiv2/app-top/v2/hits?from={start}&limit=10&type_name=reserve&X-UA=V%3D1%26PN%3DWebApp%26LANG%3Dzh_CN%26VN_CODE%3D102%26LOC%3DCN%26PLT%3DPC%26DS%3DAndroid%26UID%3D96cefb73-0032-480c-b6b9-62bedd5883d8%26OS%3DWindows%26OSV%3D10%26DT%3DPC"
            response_data = self.request_interface(url)
            if response_data is None:
                continue

            for app in response_data["data"]["list"]:
                result.append({
                    "game_id": app["app"]["id"],
                    "desc": app["app"].get("rec_text"),
                    "project_name": app["app"]["title"],
                    "reserve_count": app["app"]["stat"]["reserve_count"],
                    "game_icon": app["app"]["icon"]["small_url"]
                })
        return result
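
    # The fields read above imply a response shape roughly like the following
    # (a trimmed sketch inferred only from the keys this method accesses; the
    # real payload contains many more fields):
    #   {"data": {"list": [{"app": {"id": 123,
    #                               "title": "...",
    #                               "rec_text": "...",
    #                               "icon": {"small_url": "https://..."},
    #                               "stat": {"reserve_count": 0}}}]}}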

    def requests_url(self, url: str) -> str:
        """
        Request a web page and return its body.
        :param url: request URL
        :return: response text
        """
        response = requests.get(url=url, headers=self.headers, timeout=10)
        return response.text

    @staticmethod
    def match_content(html_str: str) -> Optional[list]:
        """
        Extract the embedded JSON payload from the page HTML.
        :param html_str: page HTML
        :return: parsed payload (a flattened list), or None if nothing matched
        """
        # Regex that matches from the first "[[" up to the first "}]", inclusive
        pattern = r'\[\[.*?}\]'

        # Find all candidates; the first match is the payload we want
        matches = re.findall(pattern, html_str)
        if not matches:
            return None

        try:
            result = json.loads(matches[0])
        except JSONDecodeError:
            # The extracted fragment is not valid JSON
            return None

        return result
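
    # Note on the payload: the page embeds its state as one flattened JSON array
    # in which objects hold integer indices into that same array instead of
    # nested values. A hypothetical illustration (shape inferred from the
    # parsing code below, not taken from a real response):
    #   [..., "游戏热搜", [10, 14], {"keyword": 11}, "some game title", ...]
    # Resolving a field therefore takes two hops: obj["field"] yields an index,
    # and html_list[index] yields the actual value. retrieval_hot_games,
    # get_app_reserve_info and search_app all walk the list this way.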

    def retrieval_hot_games(self, html_list: list) -> list:
        """
        Extract the hot-search game list from the flattened payload.
        :param html_list: flattened content list
        :return: list of hot-search games
        """
        game_list = []
        if "游戏热搜" not in html_list:
            return game_list
        # Index of the entry that follows the "游戏热搜" marker
        index = html_list.index("游戏热搜")
        try:
            hot_games = html_list[index + 1]
        except IndexError:
            return game_list

        # Walk the indices and resolve each game name
        for index in hot_games:
            try:
                item = html_list[index]
            except IndexError:
                continue

            project_name = html_list[item["keyword"]]
            # 根据游戏名称查询icon和预约数
            # url = f"https://www.taptap.cn/webapiv2/search/v5/agg-search" \
            #       f"?X-UA=V%3D1%26PN%3DWebApp%26LANG%3Dzh_CN%26VN_CODE%3D102%26LOC%3DCN%26PLT%3DPC%26DS%3DAndro" \
            #       f"id%26UID%3D96cefb73-0032-480c-b6b9-62bedd5883d8%26OS%3DWindows%26OSV%3D10%26DT%3DPC" \
            #       f"&kw={item['keyword']}&types=mix"
            # response_data = self.request_interface(url)

            # Crawl detailed game info by game name
            app_info = self.search_app(project_name=project_name)
            if app_info is None:
                game_list.append({
                    "project_name": project_name,
                })
                continue
            else:
                game_list.append(app_info)

            # try:
            #     list_data = response_data["data"]["list"][0]["list"][0]
            #     app = list_data["brand"] if "brand" in list_data else list_data
            #
            #     # keyword or display_word
            #     game_list.append({
            #         "project_name": project_name,
            #         "game_id": app["app"]["id"],
            #         "desc": app["app"].get("rec_text"),
            #         "reserve_count": app.get("stat", {}).get("reserve_count"),
            #         "game_icon": app["app"]["icon"]["small_url"]
            #     })
            # except Exception:
            #     game_list.append({
            #         "project_name": project_name,
            #     })
        return game_list

    def get_hot_searches(self):
        """Fetch the game hot-search list."""
        # Request the ranking page
        html_content = self.requests_url(url=self.url)
        # Extract the embedded payload
        html_list = self.match_content(html_str=html_content)
        if html_list is None:
            return []
        hot_games = self.retrieval_hot_games(html_list=html_list)
        return hot_games

    @staticmethod
    def remove_html_tags(text):
        # Strip HTML tags with a regex
        clean_text = re.sub(r'<.*?>', '', text)
        return clean_text

    def get_app_reserve_info(self, html_list: list) -> Optional[dict]:
        """
        Extract a game's reservation info from an app page's flattened payload.
        :param html_list: flattened content list
        :return: reservation info dict, or None if the expected indices are missing
        """
        try:
            # The fourth value of the third element points at the app's field-index dict
            index = list(html_list[2].values())[3]
            index_dict = html_list[index]

            # Reservation count
            stat_index = index_dict["stat"]
            reserve_count_index = html_list[stat_index]["reserve_count"]
            reserve_count = html_list[reserve_count_index]
            # Description
            description_index = index_dict["description"]
            text_index = html_list[description_index]["text"]
            description = html_list[text_index]
            # Icon
            icon_index = index_dict["icon"]
            small_url_index = html_list[icon_index]["small_url"]
            icon = html_list[small_url_index]
            # Title
            title_index = index_dict["title"]
            title = html_list[title_index]
            # Game id
            id_index = index_dict["id"]
            game_id = html_list[id_index]
        except IndexError:
            return None

        desc = self.remove_html_tags(description)
        desc = replace_long_string(desc, length=15)
        result = {
            "game_id": game_id,
            "desc": desc,
            "project_name": title,
            "reserve_count": reserve_count,
            "game_icon": icon
        }
        return result

    def get_reserve_by_html(self):
        """Fetch the reservation ranking by scraping the page HTML instead of the API."""
        result = []
        for page in [1, 2, 3]:
            # Request each ranking page
            url = "https://www.taptap.cn/top/reserve?page={}".format(page)
            html_content = self.requests_url(url=url)
            # Extract the embedded payload
            html_list = self.match_content(html_str=html_content)
            if html_list is None:
                continue
            app_list = [i.replace("app:", "") for i in html_list if isinstance(i, str) and "app:" in i]

            for app_id in app_list:
                app_url = "https://www.taptap.cn/app/{}".format(app_id)
                html_content = self.requests_url(url=app_url)
                # Extract the embedded payload
                html_list = self.match_content(html_str=html_content)
                if html_list is None:
                    continue
                try:
                    item = self.get_app_reserve_info(html_list=html_list)
                except Exception:
                    # Parsing occasionally fails, so retry once
                    app_url = "https://www.taptap.cn/app/{}".format(app_id)
                    html_content = self.requests_url(url=app_url)
                    # Extract the embedded payload
                    html_list = self.match_content(html_str=html_content)
                    item = self.get_app_reserve_info(html_list=html_list)

                if item is None:
                    continue
                result.append(item)
        return result
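
    # The ranking page's payload contains route strings such as "app:123456"
    # (hypothetical id); stripping the "app:" prefix above yields the numeric
    # app ids whose detail pages are then crawled one by one.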

    def search_app(self, project_name: str) -> Optional[dict]:
        """
        Search for a game via the site's search page.
        :param project_name: game name
        :return: game info dict, or None if nothing was found
        """
        url = "https://www.taptap.cn/search/{}".format(project_name)
        html_content = self.requests_url(url=url)

        # Extract the embedded payload
        html_list = self.match_content(html_str=html_content)
        if html_list is None or "mix" not in html_list:
            return None
        mix_index = html_list.index("mix")
        mix_list = html_list[mix_index + 1]  # [276, 546, 636, 766, 823, 866, 921, 963, 995, 1049, 1085, 1261]
        # The first entry is the closest match
        first = html_list[mix_list[0]]  # {'type': 277, 'property': 13, 'identification': 278, 'brand': 279}
        brand = html_list[first["brand"]]  # {'app': 280, 'contents': 340, 'event_log': 541, 'stat': 543}
        app = html_list[brand["app"]]  # {'id': 281, 'title': 146, 'icon': 284, 'rec_text': 307, 'description': 310}
        stat = html_list[brand["stat"]]  # {'hits_total': 0, 'fans_count': 0, 'bought_count': 0, 'reserve_count': 0}

        icon = html_list[app["icon"]]  # {'url': 285, 'medium_url': 286, 'small_url': 287, 'original_url': 288}

        result = {
            "game_id": html_list[app["id"]],
            "desc": html_list[app["rec_text"]] if "rec_text" in app else None,
            "project_name": html_list[app["title"]],
            "reserve_count": html_list[stat["reserve_count"]] if "reserve_count" in stat else None,
            "game_icon": html_list[icon["small_url"]]
        }
        return result

    def save_database(self, date: str, update_time: str):
        """
        Save crawled data to the database.
        :param date: date string
        :param update_time: update-time string
        """
        # Create a directory for the downloaded icons
        path = os.path.join(STATIC_ROOT, 'game')
        if not os.path.exists(path):
            os.makedirs(path)

        # reserve_list = self.get_reserve(top=30)  # The API call fails on the server, so scrape the HTML instead
        reserve_list = self.get_reserve_by_html()
        hot_searches = self.get_hot_searches()
        models = []
        for item in reserve_list:
            game_icon = os.path.join(path, "__INLINE__{}.png".format(item["game_id"]))
            res = save_icon(item['game_icon'], game_icon)
            if res:
                item["game_icon"] = '/static/game/__INLINE__{}.png'.format(item["game_id"])
            item["date"] = date
            item["update_time"] = update_time
            models.append(TapTapReserveInfo(**item))

        models_hot = []
        for item in hot_searches:
            if "game_id" in item and "game_icon" in item:
                game_icon = os.path.join(path, "__INLINE__{}.png".format(item["game_id"]))
                res = save_icon(item['game_icon'], game_icon)
                if res:
                    item["game_icon"] = '/static/game/__INLINE__{}.png'.format(item["game_id"])
            item["date"] = date
            item["update_time"] = update_time

            models_hot.append(TapTapHotSearchesInfo(**item))

        TapTapReserveInfo.objects.bulk_create(models)
        TapTapHotSearchesInfo.objects.bulk_create(models_hot)

# if __name__ == "__main__":
#     crawler = TapTapCrawler()
#     reserve_list = crawler.get_reserve(top=30)
#     print(reserve_list)
#     hot_searches = crawler.get_hot_searches()
#     print(hot_searches)
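
# A minimal usage sketch (my own assumption, not part of the original project):
# running this module requires a configured Django environment, since
# gamerobot.settings, gr_project.models and save_icon are imported at module
# level. The date/update_time values below are illustrative only.
#
# if __name__ == "__main__":
#     crawler = TapTapCrawler()
#
#     # Scrape the first three pages of the reservation ranking from the HTML
#     reserve_list = crawler.get_reserve_by_html()
#     print(reserve_list)
#
#     # Look up a single game by name (hypothetical title)
#     print(crawler.search_app(project_name="某游戏"))
#
#     # Persist both rankings; needs the database and STATIC_ROOT to be set up
#     crawler.save_database(date="2024-08-01", update_time="2024-08-01 12:00:00")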
