Advanced Web Scraping: Multithreaded Crawlers

AI Application Development: Column Contents

This column shares content related to AI application development, including but not limited to: implementation details of AI algorithm deployment, concepts and development techniques for AI back-end analysis services, concepts and development techniques for AI back-end application services, and implementation paths and development techniques for AI front ends.
It is intended for readers with a basic grounding in algorithms and Python.

  1. Overview of the AI application development workflow
  2. Remote development with Visual Studio Code and the Remote Development extension
  3. Common issues with open-source git projects and mirror-based workarounds
  4. UDP message communication in Python
  5. Log generation and periodic cleanup in Python
  6. Common usage of the Linux terminal command Screen
  7. Storing data in Redis with Python
  8. Converting Python strings to dictionaries
  9. Text vectorization and text similarity computation in Python
  10. Common MySQL data operations with Python
  11. A one-article summary of handling anomalous data in Python
  12. General-purpose data collection with selenium and bs4 (code included)
  13. Knowledge graph techniques with Python
  14. A clear roadmap for learning Python
  15. Common Linux, Git, and Docker commands
  16. Migrating Python environments between Linux and Windows
  17. Scheduled (auto) start of Python services on Linux
  18. TTS development with Python on Windows
  19. Image segmentation with Python and OpenCV
  20. Translating Word documents in Python via an API
  21. yolo-world: "a large model for object detection"
  22. Advanced web scraping: multithreaded crawlers
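
The main example below is an asynchronous crawler built on curl_cffi's AsyncSession: an asyncio.Semaphore caps the number of requests in flight, fake_useragent supplies a random User-Agent header, each page ID is retried up to three times, and successfully fetched records are appended to a JSON file in batches of ten.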


import asyncio
import json
import time
import random
from tqdm import tqdm
from fake_useragent import UserAgent
from curl_cffi.requests import AsyncSession




# Output file: fetched records are appended as JSON objects
FILENAME = "data/data2.json"

# Set request headers (random User-Agent from fake_useragent)
ua = UserAgent()
headers = {
    'User-Agent': ua.random,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'close',
    'Upgrade-Insecure-Requests': '1',
}

async def fetch_data(session, page_id):
    # Fetch a single record by ID, retrying up to three times
    url = f"https://ms.app.jiaodong.net/public/index.php/pc/v1/askInfo/{page_id}"
    retries = 3
    for _ in range(retries):
        try:
            response = await session.get(url, headers=headers, impersonate='chrome')
            if response.status_code == 200:
                return response.json()
            else:
                print(f"Error {response.status_code} for page_id {page_id}")
        except Exception as e:
            print(f"Exception for page_id {page_id}: {e}")
        await asyncio.sleep(random.uniform(1, 3))
    return None

def save_data(data):
    # Append each record to the output file as pretty-printed JSON
    with open(FILENAME, 'a', encoding='utf-8') as f:
        for item in data:
            json.dump(item, f, ensure_ascii=False, indent=4)
            f.write('\n')
        
async def fetch_and_store_data(start_id, end_id, max_concurrent_tasks=10):
    # Crawl IDs in [start_id, end_id] with at most max_concurrent_tasks
    # requests in flight, flushing results to disk in batches
    all_data = []
    async with AsyncSession() as session:
        tasks = []
        semaphore = asyncio.Semaphore(max_concurrent_tasks)

        async def sem_fetch(page_id):
            async with semaphore:
                data = await fetch_data(session, page_id)
                return data

        for page_id in range(start_id, end_id + 1):
            tasks.append(sem_fetch(page_id))
        
        for task in asyncio.as_completed(tasks):
            try:
                data = await task
                # fetch_data returns None once its retries are exhausted
                if data and data.get("code") == 200:
                    all_data.append(data)
                    # Flush to disk every 10 successful records
                    if len(all_data) >= 10:
                        save_data(all_data)
                        all_data = []
            except Exception as e:
                print(f"Exception occurred: {e}")

            # Random delay to avoid getting banned
            delay = random.uniform(0.02, 0.1)
            await asyncio.sleep(delay)
        # Save whatever is left in the final batch
        if all_data:
            save_data(all_data)

if __name__ == "__main__":
    import sys
    # Use the selector event loop on Windows to suppress asyncio warnings
    if sys.platform.startswith('win'):
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

    start_id = 927384
    end_id = 1242122 - 50
    batch_size = 50

    # Work backwards from end_id in batches of batch_size IDs
    for batch_end in tqdm(range(end_id, start_id - 1, -batch_size),
                          desc=f"Fetching data (batch_size: {batch_size})"):
        batch_start = max(batch_end - batch_size + 1, start_id)
        asyncio.run(fetch_and_store_data(batch_start, batch_end, max_concurrent_tasks=10))
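
The script above achieves concurrency with asyncio coroutines rather than OS threads. For a literally multi-threaded variant of the same crawl, a minimal sketch with concurrent.futures.ThreadPoolExecutor could look like the following. It assumes the plain requests library instead of curl_cffi, and the helper names fetch_one and crawl_range are illustrative, not part of the original script.

import random
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests

def fetch_one(page_id, headers):
    # Fetch a single record, retrying up to three times (illustrative helper)
    url = f"https://ms.app.jiaodong.net/public/index.php/pc/v1/askInfo/{page_id}"
    for _ in range(3):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            if resp.status_code == 200:
                return resp.json()
            print(f"Error {resp.status_code} for page_id {page_id}")
        except requests.RequestException as e:
            print(f"Exception for page_id {page_id}: {e}")
        time.sleep(random.uniform(1, 3))
    return None

def crawl_range(start_id, end_id, headers, max_workers=10):
    # The thread pool plays the same role as the asyncio.Semaphore above:
    # at most max_workers requests are in flight at any time
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(fetch_one, pid, headers)
                   for pid in range(start_id, end_id + 1)]
        for future in as_completed(futures):
            data = future.result()
            if data and data.get("code") == 200:
                results.append(data)
    return results

# Example usage: records = crawl_range(927384, 927434, headers, max_workers=10)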



Supplementary example: a multiprocessing-based crawler for NetEase news (requests + BeautifulSoup, with results written to an Excel sheet via xlwt)

import requests
from bs4 import BeautifulSoup
import json
import re
import multiprocessing
import xlwt
import time

def netease_spider(headers, news_class, i):
    # The first page of a channel has no page-number suffix in its JSONP URL
    if i == 1:
        url = "https://temp.163.com/special/00804KVA/cm_{0}.js?callback=data_callback".format(news_class)
    else:
        url = 'https://temp.163.com/special/00804KVA/cm_{0}_0{1}.js?callback=data_callback'.format(news_class, str(i))
    pages = []
    try:
        response = requests.get(url, headers=headers).text
    except Exception:
        print("Failed to fetch the current list page")
        return
    # Strip the data_callback(...) JSONP wrapper and keep the JSON array
    start = response.index('[')
    end = response.index('])') + 1
    data = json.loads(response[start:end])
    try:
        for item in data:
            title = item['title']
            docurl = item['docurl']
            label = item['label']
            source = item['source']
            doc = requests.get(docurl, headers=headers).text
            soup = BeautifulSoup(doc, 'lxml')
            news = soup.find_all('div', class_='post_body')[0].text
            news = re.sub(r'\s+', '', news).strip()
            pages.append([title, label, source, news])
            time.sleep(3)

    except Exception:
        print("Failed to fetch a detail page")

    return pages


def run(news_class, nums):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36'}
    tmp_result = []
    for i in range(1, nums + 1):
        tmp = netease_spider(headers, news_class, i)
        if tmp:
            tmp_result.append(tmp)

    return tmp_result


if __name__ == '__main__':

    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('网易新闻数据')
    sheet.write(0, 0, '文章标题')
    sheet.write(0, 1, '文章标签')
    sheet.write(0, 2, '文章来源')
    sheet.write(0, 3, '文章内容')

    news_classes = {'guonei', 'guoji'}
    nums = 3
    index = 1
    # Two news classes, so a small pool is enough
    pool = multiprocessing.Pool(len(news_classes))
    # Submit every task before collecting results, so the
    # channels are actually crawled in parallel
    async_results = [pool.apply_async(run, (news_class, nums)) for news_class in news_classes]
    pool.close()
    for result in async_results:
        for pages in result.get():
            for page in pages:
                if page:
                    title, label, source, news = page
                    sheet.write(index, 0, title)
                    sheet.write(index, 1, label)
                    sheet.write(index, 2, source)
                    sheet.write(index, 3, news)
                    index += 1
    pool.join()
    print("Crawled {0} news articles in total".format(index - 1))
    book.save(u"网易新闻爬虫结果.xls")
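
Note on the pool usage: pool.apply_async returns an AsyncResult immediately, and calling .get() on it right after submission would block until that single task finishes, effectively serializing the crawl. Submitting every task first and only then collecting results keeps the news channels running in parallel; pool.map or pool.starmap would accomplish the same thing more compactly.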