Advanced Web Scraping: Multithreaded Crawlers

AI Application Development: Column Contents

This column shares content related to AI application development, including but not limited to: implementation details of AI algorithm deployment, concepts and development techniques for AI back-end analysis services, concepts and development techniques for AI back-end application services, and implementation paths and development techniques for AI front ends.
It is intended for readers with a basic grounding in algorithms and Python.

  1. Overview of the AI application development workflow
  2. Remote development with Visual Studio Code and the Remote Development extension
  3. Common issues with open-source git projects and mirror-based workarounds
  4. UDP message communication in Python
  5. Log generation and periodic cleanup in Python
  6. Common usage of the Linux terminal command Screen
  7. Storing data in Redis with Python
  8. Converting Python strings to dictionaries
  9. Text vectorization and text similarity computation in Python
  10. Common MySQL data operations with Python
  11. A one-article summary of handling anomalous data in Python
  12. General-purpose data collection with selenium and bs4 (code included)
  13. Knowledge graph techniques with Python
  14. A clear roadmap for learning Python
  15. Common Linux, Git, and Docker commands
  16. Migrating Python environments between Linux and Windows
  17. Scheduled (auto) start of Python services on Linux
  18. TTS development with Python on Windows
  19. Image segmentation with Python and OpenCV
  20. Translating Word documents in Python via an API
  21. yolo-world: "a large model for object detection"
  22. Advanced web scraping: multithreaded crawlers
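
The main example below is an asynchronous crawler built on curl_cffi's AsyncSession: an asyncio.Semaphore caps the number of requests in flight, fake_useragent supplies a random User-Agent header, each page ID is retried up to three times, and successfully fetched records are appended to a JSON file in batches of ten.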


import asyncio
import json
import time
import random
from tqdm import tqdm
from fake_useragent import UserAgent
from curl_cffi.requests import AsyncSession




# Output file: fetched records are appended as JSON objects
FILENAME = "data/data2.json"

# Set request headers (random User-Agent from fake_useragent)
ua = UserAgent()
headers = {
    'User-Agent': ua.random,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'close',
    'Upgrade-Insecure-Requests': '1',
}

async def fetch_data(session, page_id):
    # Fetch a single record by ID, retrying up to three times
    url = f"https://ms.app.jiaodong.net/public/index.php/pc/v1/askInfo/{page_id}"
    retries = 3
    for _ in range(retries):
        try:
            response = await session.get(url, headers=headers, impersonate='chrome')
            if response.status_code == 200:
                return response.json()
            else:
                print(f"Error {response.status_code} for page_id {page_id}")
        except Exception as e:
            print(f"Exception for page_id {page_id}: {e}")
        await asyncio.sleep(random.uniform(1, 3))
    return None

def save_data(data):
    # Append each record to the output file as pretty-printed JSON
    with open(FILENAME, 'a', encoding='utf-8') as f:
        for item in data:
            json.dump(item, f, ensure_ascii=False, indent=4)
            f.write('\n')
        
async def fetch_and_store_data(start_id, end_id, max_concurrent_tasks=10):
    # Crawl IDs in [start_id, end_id] with at most max_concurrent_tasks
    # requests in flight, flushing results to disk in batches
    all_data = []
    async with AsyncSession() as session:
        tasks = []
        semaphore = asyncio.Semaphore(max_concurrent_tasks)

        async def sem_fetch(page_id):
            async with semaphore:
                data = await fetch_data(session, page_id)
                return data

        for page_id in range(start_id, end_id + 1):
            tasks.append(sem_fetch(page_id))
        
        for task in asyncio.as_completed(tasks):
            try:
                data = await task
                # fetch_data returns None once its retries are exhausted
                if data and data.get("code") == 200:
                    all_data.append(data)
                    # Flush to disk every 10 successful records
                    if len(all_data) >= 10:
                        save_data(all_data)
                        all_data = []
            except Exception as e:
                print(f"Exception occurred: {e}")

            # Random delay to avoid getting banned
            delay = random.uniform(0.02, 0.1)
            await asyncio.sleep(delay)
        # Save whatever is left in the final batch
        if all_data:
            save_data(all_data)

if __name__ == "__main__":
    import sys
    # Use the selector event loop on Windows to suppress asyncio warnings
    if sys.platform.startswith('win'):
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

    start_id = 927384
    end_id = 1242122 - 50
    batch_size = 50

    # Work backwards from end_id in batches of batch_size IDs
    for batch_end in tqdm(range(end_id, start_id - 1, -batch_size),
                          desc=f"Fetching data (batch_size: {batch_size})"):
        batch_start = max(batch_end - batch_size + 1, start_id)
        asyncio.run(fetch_and_store_data(batch_start, batch_end, max_concurrent_tasks=10))
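
The script above achieves concurrency with asyncio coroutines rather than OS threads. For a literally multi-threaded variant of the same crawl, a minimal sketch with concurrent.futures.ThreadPoolExecutor could look like the following. It assumes the plain requests library instead of curl_cffi, and the helper names fetch_one and crawl_range are illustrative, not part of the original script.

import random
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests

def fetch_one(page_id, headers):
    # Fetch a single record, retrying up to three times (illustrative helper)
    url = f"https://ms.app.jiaodong.net/public/index.php/pc/v1/askInfo/{page_id}"
    for _ in range(3):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            if resp.status_code == 200:
                return resp.json()
            print(f"Error {resp.status_code} for page_id {page_id}")
        except requests.RequestException as e:
            print(f"Exception for page_id {page_id}: {e}")
        time.sleep(random.uniform(1, 3))
    return None

def crawl_range(start_id, end_id, headers, max_workers=10):
    # The thread pool plays the same role as the asyncio.Semaphore above:
    # at most max_workers requests are in flight at any time
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(fetch_one, pid, headers)
                   for pid in range(start_id, end_id + 1)]
        for future in as_completed(futures):
            data = future.result()
            if data and data.get("code") == 200:
                results.append(data)
    return results

# Example usage: records = crawl_range(927384, 927434, headers, max_workers=10)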



Supplementary example: a multiprocessing-based crawler for NetEase news (requests + BeautifulSoup, with results written to an Excel sheet via xlwt)

import requests
from bs4 import BeautifulSoup
import json
import re
import multiprocessing
import xlwt
import time

def netease_spider(headers, news_class, i):
    # The first page of a channel has no page-number suffix in its JSONP URL
    if i == 1:
        url = "https://temp.163.com/special/00804KVA/cm_{0}.js?callback=data_callback".format(news_class)
    else:
        url = 'https://temp.163.com/special/00804KVA/cm_{0}_0{1}.js?callback=data_callback'.format(news_class, str(i))
    pages = []
    try:
        response = requests.get(url, headers=headers).text
    except Exception:
        print("Failed to fetch the current list page")
        return
    # Strip the data_callback(...) JSONP wrapper and keep the JSON array
    start = response.index('[')
    end = response.index('])') + 1
    data = json.loads(response[start:end])
    try:
        for item in data:
            title = item['title']
            docurl = item['docurl']
            label = item['label']
            source = item['source']
            doc = requests.get(docurl, headers=headers).text
            soup = BeautifulSoup(doc, 'lxml')
            news = soup.find_all('div', class_='post_body')[0].text
            news = re.sub(r'\s+', '', news).strip()
            pages.append([title, label, source, news])
            time.sleep(3)

    except Exception:
        print("Failed to fetch a detail page")

    return pages


def run(news_class, nums):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36'}
    tmp_result = []
    for i in range(1, nums + 1):
        tmp = netease_spider(headers, news_class, i)
        if tmp:
            tmp_result.append(tmp)

    return tmp_result


if __name__ == '__main__':

    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('网易新闻数据')
    sheet.write(0, 0, '文章标题')
    sheet.write(0, 1, '文章标签')
    sheet.write(0, 2, '文章来源')
    sheet.write(0, 3, '文章内容')

    news_classes = {'guonei', 'guoji'}
    nums = 3
    index = 1
    # Two news classes, so a small pool is enough
    pool = multiprocessing.Pool(len(news_classes))
    # Submit every task before collecting results, so the
    # channels are actually crawled in parallel
    async_results = [pool.apply_async(run, (news_class, nums)) for news_class in news_classes]
    pool.close()
    for result in async_results:
        for pages in result.get():
            for page in pages:
                if page:
                    title, label, source, news = page
                    sheet.write(index, 0, title)
                    sheet.write(index, 1, label)
                    sheet.write(index, 2, source)
                    sheet.write(index, 3, news)
                    index += 1
    pool.join()
    print("Crawled {0} news articles in total".format(index - 1))
    book.save(u"网易新闻爬虫结果.xls")
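
Note on the pool usage: pool.apply_async returns an AsyncResult immediately, and calling .get() on it right after submission would block until that single task finishes, effectively serializing the crawl. Submitting every task first and only then collecting results keeps the news channels running in parallel; pool.map or pool.starmap would accomplish the same thing more compactly.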