Python异步抓取MSDN下载链接

本文介绍了一个使用 Python 异步库 aiohttp 与 asyncio 发起网络请求的工作流程:从 msdn.itellyou.cn 网站抓取菜单、类别和子项信息,整理数据后写入 Excel 文件。
摘要由CSDN通过智能技术生成
import aiohttp
import asyncio
from lxml import etree
import json
import pandas as pd
import os

async def fetch(session, url, method='GET', data=None, headers=None):
    """Perform an HTTP request and return the response body as text.

    Args:
        session: an aiohttp.ClientSession (or compatible object).
        url: target URL.
        method: HTTP verb. The old implementation only handled 'GET' and
            'POST' and silently returned None for anything else; using
            ``session.request`` supports every verb uniformly.
        data: form payload for body-carrying requests (None for GET here).
        headers: optional request headers.

    Returns:
        The decoded response text.
    """
    # One generic call replaces the GET/POST if/elif chain and its silent
    # None fall-through for unsupported methods.
    async with session.request(method, url, data=data, headers=headers) as response:
        return await response.text()

async def start_requests(session, url, headers):
    """Fetch the MSDN ItellYou home page and scrape the top-level menu.

    Returns:
        A tuple ``(menu_ids, titles)``: the ``data-menuid`` attributes of
        the accordion panel links, and the corresponding panel title texts.
    """
    page = await fetch(session, url, headers=headers)
    dom = etree.HTML(page)
    menu_ids = dom.xpath('//h4[@class="panel-title"]/a/@data-menuid')
    titles = dom.xpath('//*[@id="accordion"]/div/div/h4/a/text()')
    return menu_ids, titles

async def get_Long(session, url, id, headers):
    """Resolve the default language id for a category.

    POSTs the category *id* to the GetLang endpoint and returns the ``id``
    of the first entry in the JSON ``result`` list, or None when the
    response is not valid JSON or lacks the expected shape.
    """
    form_data = {'id': id}
    response = await fetch(session, url, method='POST', data=form_data, headers=headers)
    try:
        result = json.loads(response)
        # First language entry is treated as the default by the caller.
        return result['result'][0]['id']
    except (json.JSONDecodeError, KeyError, IndexError, TypeError):
        # Narrowed from a bare `except:` — only the expected parse/shape
        # failures are swallowed, not e.g. KeyboardInterrupt.
        return None

async def get_list(session, url, get_category_id, get_long_id, headers):
    """POST category and language ids to the GetList endpoint.

    Returns:
        The ``result`` list from the JSON response, or ``[]`` if absent.
    """
    payload = {'id': get_category_id, 'lang': get_long_id, 'filter': 'true'}
    body = await fetch(session, url, method='POST', data=payload, headers=headers)
    parsed = json.loads(body)
    return parsed.get('result', [])

async def get_category(session, url, menuid, headers):
    """POST a menu id to the GetCategory endpoint; return the parsed JSON."""
    payload = {'id': menuid}
    raw = await fetch(session, url, method='POST', data=payload, headers=headers)
    return json.loads(raw)


def write_excel(data):
    """Write *data* (a list of row dicts) to ./output.xlsx, sheet 'Sheet1'.

    If the workbook already exists it is opened in append mode with
    ``if_sheet_exists='replace'``, so 'Sheet1' is overwritten rather than
    duplicated; otherwise a new workbook is created.
    """
    df = pd.DataFrame(data)
    file_path = './output.xlsx'
    writer_kwargs = {'mode': 'w', 'engine': 'openpyxl'}
    if os.path.exists(file_path):
        # if_sheet_exists is only accepted by ExcelWriter in append mode,
        # hence it is added together with mode='a'.
        writer_kwargs.update(mode='a', if_sheet_exists='replace')
    # Single write path replaces the two duplicated with-blocks.
    with pd.ExcelWriter(file_path, **writer_kwargs) as writer:
        df.to_excel(writer, sheet_name='Sheet1', index=False)

async def main():
    """Crawl msdn.itellyou.cn — menus -> categories -> languages -> item
    lists — and dump everything to ./output.xlsx via write_excel().
    """
    # Session cookies captured from a browser; the ASP.NET antiforgery
    # cookie must pair with the 'x-csrf-token' header below or the POST
    # endpoints reject the request. These tokens expire — TODO confirm
    # they are still valid before running.
    cookies = {
        '.AspNetCore.Antiforgery.fvX-_i-L-U0': 'CfDJ8E4IS3mPUJpJkOaPn3XTDaJ-ynJ_lMx3vfNdyGAOVg78COVn3dJhZAPbjIya_d8SlWF-su8Jz3Qg0XpXYg857pqybjcrKvB2d0KK3yjLa_tnuIZ8qZJ2vwutrfYYWBbPp9kcPhvQvku_WA5t1VN0HIc',
        'Hm_lvt_8688ca4bc18cbc647c9c68fdaef6bc24': '1713427035,1713755571,1713952611',
        'Hm_lpvt_8688ca4bc18cbc647c9c68fdaef6bc24': '1713952611',
    }

    # Browser-like headers; 'x-requested-with' marks the calls as AJAX,
    # matching what the site's own frontend sends.
    headers = {
        'accept': '*/*',
        'accept-language': 'zh-CN,zh;q=0.9',
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
        # 'cookie': '.AspNetCore.Antiforgery.fvX-_i-L-U0=CfDJ8E4IS3mPUJpJkOaPn3XTDaJ-ynJ_lMx3vfNdyGAOVg78COVn3dJhZAPbjIya_d8SlWF-su8Jz3Qg0XpXYg857pqybjcrKvB2d0KK3yjLa_tnuIZ8qZJ2vwutrfYYWBbPp9kcPhvQvku_WA5t1VN0HIc; Hm_lvt_8688ca4bc18cbc647c9c68fdaef6bc24=1713427035,1713755571,1713952611; Hm_lpvt_8688ca4bc18cbc647c9c68fdaef6bc24=1713952611',
        'origin': 'https://msdn.itellyou.cn',
        'priority': 'u=1, i',
        'referer': 'https://msdn.itellyou.cn/',
        'sec-ch-ua': '"Chromium";v="124", "Google Chrome";v="124", "Not-A.Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
        'x-csrf-token': 'CfDJ8E4IS3mPUJpJkOaPn3XTDaKTYYay4LzGBkSRx0eLD1RuJbAY95uMm9OqsDcQUwIW3NX1kWDqlpjfPAglXA0ltQvy-gPddMbWciVI5QY_dj5lpo8-M1UXeNDTi9eYsBrACka5OaJoT27eagi7MsE55t0',
        'x-requested-with': 'XMLHttpRequest',
    }
    async with aiohttp.ClientSession(cookies=cookies) as session:
        # Top-level accordion menu: parallel lists of menu ids and titles.
        menuid_lst, title_lst = await start_requests(session, 'https://msdn.itellyou.cn/', headers)
        all_data = []
        for menuid, title in zip(menuid_lst,title_lst):
            category_data = await get_category(session, 'https://msdn.itellyou.cn/Index/GetCategory', menuid,headers)
            # Assume category_data contains the information needed to call get_Long and get_list
            for item in category_data:
                # NOTE(review): requests run strictly sequentially; get_Long
                # may return None, which is then passed to get_list as-is.
                lang_id = await get_Long(session, 'https://msdn.itellyou.cn/Index/GetLang', item['id'], headers)
                list_data = await get_list(session, 'https://msdn.itellyou.cn/Index/GetList', item['id'], lang_id, headers)
                # Append results for writing to Excel
                # (columns: level-1 title / level-2 title / level-3 title / url)
                extracted_data = [{'一级标题':title , '二级标题' : item['name'] , '三级标题' : i['name'] , 'url' : i['url']} for i in list_data]
                all_data.extend(extracted_data)
            print(f'{title} has been completed.')
        # Call to write data to Excel once all data is collected
        write_excel(all_data)
        print("All data has been written to Excel.")


# Script entry point: run the whole crawl inside a fresh event loop.
if __name__ == '__main__':
    asyncio.run(main())

  • 25
    点赞
  • 9
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值