python爬虫：2024最新华为应用市场app实战，附代码

MATANGLIANG

已于 2024-11-08 10:35:34 修改

阅读量1.3k

点赞数 4

文章标签： python 爬虫华为

于 2024-10-12 10:16:54 首次发布

本文链接：https://blog.csdn.net/Evan1of1/article/details/142870799

版权

文章目录

概要

逆向方法很简单，请求getInterfaceCode接口

整体架构流程

请求 getInterfaceCode 接口，会直接返回 Interface-Code 的值；将该值添加到请求头（headers）的 Interface-Code 字段中，再去请求数据接口，即可返回 JSON 数据

技术名词解释

getInterfaceCode = "https://web-drcn.hispace.dbankcloud.com/edge/webedge/getInterfaceCode"

技术细节

主调用代码：先调用工具类 tools 中的代码获取会话 id（Interface-Code），再请求各个接口

import requests
from tools import getInterfaceCode,getrealTabId


class HuaweiSpiderApp:
    """Crawler for the Huawei AppGallery web API.

    On construction it obtains an Interface-Code value through
    ``getInterfaceCode()`` and stores it, together with browser-like request
    headers, so every later request can reuse them.
    """

    # Endpoint used for both the category listings and the app detail pages.
    INDEX_URL = 'https://web-drcn.hispace.dbankcloud.com/edge/uowap/index'
    # Seconds to wait for a response, so a stalled connection cannot hang the crawl.
    REQUEST_TIMEOUT = 10
    # Position inside ``layoutData`` of the detail section read for the developer.
    DEVELOPER_LAYOUT_INDEX = 8
    # Fields copied verbatim from each listing item into its record.
    # Look at the listing response to find more fields that can be added here.
    ITEM_FIELDS = (
        'appid',
        'name',
        'kindName',
        'downCountDesc',
        'intro',
        'package',
        'score',
        'tagName',
        'appVersionName',
    )

    def __init__(self):
        self.getInterfaceCode = getInterfaceCode()
        self.headers = {
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'Connection': 'keep-alive',
            'Interface-Code': self.getInterfaceCode,
            'Origin': 'https://appgallery.huawei.com',
            'Referer': 'https://appgallery.huawei.com/',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'cross-site',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0',
            'sec-ch-ua': '"Microsoft Edge";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
        }

    def data_list(self):
        """Crawl every category page by page and print one record per app.

        For each id returned by ``getrealTabId`` the listing endpoint is
        requested with an increasing page number until it answers with an
        empty ``layoutData``.

        Returns:
            list[dict]: every record that was printed, in crawl order. Each
            record is its own dict, so later records never overwrite earlier
            ones.
        """
        records = []
        for tab_id in getrealTabId(self.getInterfaceCode):
            page = 1
            while True:
                print(f'正在采集第{page}页')
                params = {
                    'method': 'internal.getTabDetail',
                    'serviceType': '20',
                    'reqPageNum': page,  # incremented by the surrounding loop
                    'uri': tab_id,
                    'maxResults': '50',
                    'zone': '',
                    'locale': 'zh',
                }
                response = requests.get(
                    self.INDEX_URL,
                    params=params,
                    headers=self.headers,
                    timeout=self.REQUEST_TIMEOUT,
                )
                layout_data = response.json()['layoutData']
                if not layout_data:
                    # The current page came back empty, so only the pages
                    # before it actually contained data.
                    print(f'类型{tab_id}采集结束，共采集{page - 1}页')
                    break
                data = layout_data[0]['dataList']
                print(f'数据量：{len(data)}')
                for item in data:
                    # Build a fresh dict per app instead of mutating a shared one.
                    record = {field: item[field] for field in self.ITEM_FIELDS}
                    record['enterprise'] = self.data_intro(item['appid'])
                    print(record)
                    records.append(record)
                page += 1
        return records

    def data_intro(self, appid):
        """Fetch the detail page of one app and return its developer name.

        Args:
            appid: app identifier, as found in the ``appid`` field of a
                listing item.

        Returns:
            The ``developer`` value of the first entry in the detail section
            at ``DEVELOPER_LAYOUT_INDEX``, or ``None`` when that section is
            missing or empty.
        """
        params = {
            'method': 'internal.getTabDetail',
            'serviceType': '20',
            'reqPageNum': '1',
            'maxResults': '25',
            'uri': f'app|{appid}',
            'shareTo': '',
            # NOTE(review): this value is already percent-encoded and refers
            # to one fixed app; requests encodes query values again. Left
            # unchanged -- confirm whether the server uses it at all.
            'currentUrl': 'https%3A%2F%2Fappgallery.huawei.com%2Fapp%2FC107863167',
            'accessId': '',
            'appid': appid,
            'zone': '',
            'locale': 'zh',
        }
        response = requests.get(
            self.INDEX_URL,
            params=params,
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        return self._first_developer(response.json().get('layoutData'))

    @classmethod
    def _first_developer(cls, layout_data):
        """Return the developer of the first detail entry, or None if unavailable."""
        index = cls.DEVELOPER_LAYOUT_INDEX
        # Detail pages with fewer sections than expected must not abort the crawl.
        if not layout_data or len(layout_data) <= index:
            return None
        entries = layout_data[index].get('dataList') or []
        if not entries:
            return None
        return entries[0].get('developer')

if __name__ == '__main__':
    # Start the crawl only when the file is executed as a script, not on import.
    spider = HuaweiSpiderApp()
    spider.data_list()

工具类tools

import requests
import re
def getInterfaceCode(timeout=10):
    """Request an Interface-Code value (the session id) from the edge service.

    Args:
        timeout: seconds to wait for the HTTP response before requests raises
            a timeout error; without it a stalled connection blocks forever.

    Returns:
        str: the response body with every double-quote character removed.
    """
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Connection': 'keep-alive',
        'Content-Type': 'application/json',
        'Origin': 'https://appgallery.huawei.com',
        'Referer': 'https://appgallery.huawei.com/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'cross-site',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0',
        'sec-ch-ua': '"Microsoft Edge";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    json_data = {
        'params': {},
        'zone': '',
        'locale': 'zh',
    }
    url = 'https://web-drcn.hispace.dbankcloud.com/edge/webedge/getInterfaceCode'
    response = requests.post(
        url,
        headers=headers,
        json=json_data,
        timeout=timeout,
    )
    # Drop every double quote from the body (presumably the value is returned
    # as a quoted JSON string -- confirm); a plain replace needs no regex.
    return response.text.replace('"', '')

def getrealTabId(getInterfaceCode, timeout=10):
    """Fetch the category tabs and return the id of every nested tab.

    Args:
        getInterfaceCode: Interface-Code value sent in the request header of
            the same name.
        timeout: seconds to wait for the HTTP response before requests raises
            a timeout error; without it a stalled connection blocks forever.

    Returns:
        list: the ``realTabId`` of each entry nested under the top-level
        ``tabInfo`` entries, in response order.
    """
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Connection': 'keep-alive',
        'Interface-Code': getInterfaceCode,
        'Origin': 'https://appgallery.huawei.com',
        'Referer': 'https://appgallery.huawei.com/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'cross-site',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0',
        'sec-ch-ua': '"Microsoft Edge";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    params = {
        'method': 'internal.getTabDetail',
        'serviceType': '20',
        'reqPageNum': '1',
        'uri': 'b2b4752f0a524fe5ad900870f88c11ed',
        'maxResults': '25',
        'zone': '',
        'locale': 'zh',
    }

    response = requests.get(
        'https://web-drcn.hispace.dbankcloud.com/edge/uowap/index',
        params=params,
        headers=headers,
        timeout=timeout,
    )
    top_level_tabs = response.json()['tabInfo']
    real_tab_ids = []
    for tab in top_level_tabs:
        # A top-level tab without nested tabs contributes nothing instead of
        # raising KeyError.
        nested_tabs = tab.get('tabInfo') or []
        real_tab_ids.extend(nested['realTabId'] for nested in nested_tabs)
    return real_tab_ids