python爬虫:2024最新华为应用市场app实战,附代码

概要

逆向方法很简单:请求getInterfaceCode接口,即可拿到后续请求所需的Interface-Code

整体架构流程

请求getInterfaceCode接口,会直接返回Interface-Code参数;将该参数添加到请求头(headers)中再去请求数据接口,接口将返回json数据

技术名词解释

getInterfaceCode = "https://web-drcn.hispace.dbankcloud.com/edge/webedge/getInterfaceCode"

技术细节

主调用代码:通过调用工具类tools中的代码获取会话id,再请求各个接口

import requests
from tools import getInterfaceCode,getrealTabId


class HuaweiSpiderApp:
    """Crawl app listings from the Huawei AppGallery web API.

    On construction the spider obtains an ``Interface-Code`` value through
    ``getInterfaceCode()`` and sends it as a header on every later request.
    """

    # Endpoint used by both the listing and the detail requests.
    API_URL = 'https://web-drcn.hispace.dbankcloud.com/edge/uowap/index'

    # Seconds to wait for the server; without a timeout ``requests`` can
    # block forever on a stalled connection.
    REQUEST_TIMEOUT = 10

    # Keys copied from every listing entry into the output record.
    ITEM_FIELDS = (
        'appid',
        'name',
        'kindName',
        'downCountDesc',
        'intro',
        'package',
        'score',
        'tagName',
        'appVersionName',
    )

    # Position inside the detail response's ``layoutData`` that is read for
    # the developer name.
    # NOTE(review): taken from the observed response layout - confirm it is
    # stable across apps.
    DEVELOPER_SECTION_INDEX = 8

    def __init__(self):
        self.getInterfaceCode = getInterfaceCode()
        self.headers = {
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'Connection': 'keep-alive',
            'Interface-Code': self.getInterfaceCode,
            'Origin': 'https://appgallery.huawei.com',
            'Referer': 'https://appgallery.huawei.com/',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'cross-site',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0',
            'sec-ch-ua': '"Microsoft Edge";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
        }

    def data_list(self):
        """Page through every category tab and print one record per app.

        Paging of a tab stops at the first page whose ``layoutData`` is
        empty or missing.

        Returns:
            list[dict]: every record that was printed, in crawl order. Each
            record holds the ``ITEM_FIELDS`` values plus ``'enterprise'``
            (the result of ``data_intro`` for that app).

        Raises:
            requests.RequestException: on a network failure, timeout or a
                4xx/5xx response.
        """
        headers = self.headers
        records = []
        for tab_id in getrealTabId(self.getInterfaceCode):
            page = 1
            while True:
                print(f'正在采集第{page}页')
                params = {
                    'method': 'internal.getTabDetail',
                    'serviceType': '20',
                    'reqPageNum': page,  # advanced by the surrounding loop
                    'uri': tab_id,
                    'maxResults': '50',
                    'zone': '',
                    'locale': 'zh',
                }
                response = requests.get(
                    self.API_URL,
                    params=params,
                    headers=headers,
                    timeout=self.REQUEST_TIMEOUT,
                )
                response.raise_for_status()
                layout_data = response.json().get('layoutData') or []
                if not layout_data:
                    # ``page`` is the empty page, so only page - 1 pages
                    # actually delivered data.
                    print(f'类型{tab_id}采集结束,共采集{page - 1}页')
                    break
                data = layout_data[0].get('dataList') or []
                print(f'数据量:{len(data)}')
                # Inspect this endpoint's response to find more fields that
                # could be added to ITEM_FIELDS.
                for item in data:
                    # A fresh dict per app, so stored records never alias
                    # each other.
                    record = {field: item.get(field) for field in self.ITEM_FIELDS}
                    appid = record['appid']
                    # Without an app id there is nothing to look up.
                    record['enterprise'] = self.data_intro(appid) if appid else None
                    print(record)
                    records.append(record)
                page += 1
        return records

    def data_intro(self, appid):
        """Fetch the detail data of one app and return its developer name.

        Args:
            appid: app identifier as found in the listing entries.

        Returns:
            The ``'developer'`` value of the first entry in the detail
            section at ``DEVELOPER_SECTION_INDEX``, or ``None`` when that
            section, its entries or the key are absent.

        Raises:
            requests.RequestException: on a network failure, timeout or a
                4xx/5xx response.
        """
        params = {
            'method': 'internal.getTabDetail',
            'serviceType': '20',
            'reqPageNum': '1',
            'maxResults': '25',
            'uri': f'app|{appid}',
            'shareTo': '',
            'currentUrl': 'https%3A%2F%2Fappgallery.huawei.com%2Fapp%2FC107863167',
            'accessId': '',
            'appid': appid,
            'zone': '',
            'locale': 'zh',
        }

        response = requests.get(
            self.API_URL,
            params=params,
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        sections = response.json().get('layoutData') or []
        # Guard the fixed position instead of raising IndexError on a
        # shorter response.
        if len(sections) <= self.DEVELOPER_SECTION_INDEX:
            return None
        entries = sections[self.DEVELOPER_SECTION_INDEX].get('dataList') or []
        if not entries:
            return None
        return entries[0].get('developer')

if __name__ == '__main__':
    # Script entry point: build the spider and crawl every category.
    spider = HuaweiSpiderApp()
    spider.data_list()

 工具类tools

import requests
import re
def getInterfaceCode():
    """Fetch the session id that is sent as the ``Interface-Code`` header.

    Returns:
        str: the response body with every double quote removed and
        surrounding whitespace stripped.

    Raises:
        requests.RequestException: on a network failure, timeout or a
            4xx/5xx response.
    """
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Connection': 'keep-alive',
        'Content-Type': 'application/json',
        'Origin': 'https://appgallery.huawei.com',
        'Referer': 'https://appgallery.huawei.com/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'cross-site',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0',
        'sec-ch-ua': '"Microsoft Edge";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    json_data = {
        'params': {},
        'zone': '',
        'locale': 'zh',
    }
    url = 'https://web-drcn.hispace.dbankcloud.com/edge/webedge/getInterfaceCode'
    response = requests.post(
        url,
        headers=headers,
        json=json_data,
        timeout=10,  # never block forever on a stalled connection
    )
    response.raise_for_status()
    # Drop the double quotes around the value (presumably the body is a JSON
    # string literal - TODO confirm) and any stray whitespace, which would
    # make it unusable as an HTTP header value.
    return response.text.replace('"', '').strip()

def getrealTabId(getInterfaceCode):
    """Return the ids of all category tabs.

    Args:
        getInterfaceCode: value to send as the ``Interface-Code`` header.

    Returns:
        list: the ``realTabId`` of every entry in the nested ``tabInfo``
        lists of the response, in response order. Empty when the response
        carries no ``tabInfo``.

    Raises:
        requests.RequestException: on a network failure, timeout or a
            4xx/5xx response.
    """
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Connection': 'keep-alive',
        'Interface-Code': getInterfaceCode,
        'Origin': 'https://appgallery.huawei.com',
        'Referer': 'https://appgallery.huawei.com/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'cross-site',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0',
        'sec-ch-ua': '"Microsoft Edge";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    params = {
        'method': 'internal.getTabDetail',
        'serviceType': '20',
        'reqPageNum': '1',
        'uri': 'b2b4752f0a524fe5ad900870f88c11ed',
        'maxResults': '25',
        'zone': '',
        'locale': 'zh',
    }

    response = requests.get(
        'https://web-drcn.hispace.dbankcloud.com/edge/uowap/index',
        params=params,
        headers=headers,
        timeout=10,  # never block forever on a stalled connection
    )
    response.raise_for_status()
    groups = response.json().get('tabInfo') or []
    tab_ids = []
    for group in groups:
        # Each top-level entry carries its own nested ``tabInfo`` list.
        for tab in group.get('tabInfo') or []:
            if 'realTabId' in tab:
                tab_ids.append(tab['realTabId'])
    return tab_ids

  • API
  • 支持模型类型

小结

### 创建或使用华为应用商店的网络爬虫

为了创建一个能够有效抓取华为应用商店APP信息的Python爬虫程序,可以采用`requests`库来发送HTTP请求并获取网页内容,再利用`BeautifulSoup`或者`lxml`解析HTML文档提取所需数据。对于更复杂的交互场景,则可能需要用到像Selenium这样的工具模拟浏览器行为。

#### 准备工作

安装必要的第三方库可以通过pip命令完成:

```bash
pip install requests beautifulsoup4 lxml selenium pandas openpyxl
```

#### 发送请求与处理响应

通过`requests.get()`函数向目标URL发起GET请求,并设置合适的headers模仿真实用户的访问环境,从而减少被反爬机制拦截的风险[^1]。

```python
import requests

url = 'https://appstore.huawei.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
}
response = requests.get(url, headers=headers)
if response.status_code == 200:
    html_content = response.text
else:
    print(f"Failed to fetch page with status code {response.status_code}")
```

#### 数据解析

一旦获得了页面源码之后,就可以运用`BeautifulSoup`对象来进行DOM树遍历操作,定位到包含应用程序详情的位置,进而抽取名称、评分等字段保存至列表结构中以便后续存储为CSV文件。

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_content, 'html.parser')
apps_info = []
for item in soup.select('.app-info'):
    app_name = item.find('h3').get_text(strip=True)
    rating = float(item.find(class_='score')['aria-label'].split()[0])
    apps_info.append({
        'name': app_name,
        'rating': rating,
    })
```

#### 存储结果

最后一步就是调用Pandas库中的DataFrame类以及to_csv()方法实现自动化导出表格的功能了。这不仅提高了工作效率还便于后期数据分析工作的开展。

```python
import pandas as pd

df_apps = pd.DataFrame(apps_info)
output_file_path = './huawei_app_store_data.csv'
df_apps.to_csv(output_file_path, index=False, encoding='utf_8_sig')
print(f'Data has been successfully saved into "{output_file_path}"')
```
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值