进程池爬取今日头条图片

from multiprocessing.pool import Pool
import json
import os
import re
import requests

# Request headers sent with the article page fetch (desktop-browser User-Agent).
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/71.0.3578.98 Safari/537.36",
}
# Captures the string argument of a JSON.parse("...") call in the page source;
# that argument is the JSON holding the image addresses.
# Raw string: '\(' in a normal string literal is an invalid escape sequence.
# The dot is escaped so it only matches a literal "JSON.parse".
pattern = re.compile(r'JSON\.parse\("(.*?)"\),', re.S)


def get_img_urls(page_url):
    """Fetch an article page and collect the image URLs embedded in it.

    The page's ``<title>`` text is used as the name of a directory (created
    under the current working directory if missing) in which the images are
    meant to be saved.

    Args:
        page_url: URL of the article page to fetch.

    Returns:
        A ``(path, img_urls)`` tuple: the save directory (``./<title>/``) and
        the list of image URLs taken from the page's ``sub_images`` entries.

    Raises:
        ValueError: if the page has no ``JSON.parse("...")`` payload or no
            ``<title>`` element (also raised by ``json.loads`` when the
            payload cannot be decoded).
    """
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(page_url, headers=headers, timeout=10)
    text = response.text

    # Validate the page before touching the filesystem, so a page without
    # the expected data does not leave an empty directory behind.
    payloads = pattern.findall(text)
    if not payloads:
        raise ValueError(f'no JSON.parse payload found in {page_url}')
    titles = re.findall('<title>(.*?)</title>', text)
    if not titles:
        raise ValueError(f'no <title> found in {page_url}')

    # Save location, named after the page title.
    path = fr'./{titles[0]}/'
    os.makedirs(path, exist_ok=True)

    # Strip the escaping around the embedded JSON so json.loads accepts it:
    # drop the 'u002F' markers, turn triple backslashes into '/', then remove
    # any remaining backslashes.
    img_str = payloads[0].replace('u002F', '').replace('\\\\\\', '/').replace('\\', '')
    img_list = json.loads(img_str)

    # Each sub_images entry offers several URLs; the third one is used.
    img_urls = [sub_image['url_list'][2]['url'] for sub_image in img_list['sub_images']]

    return path, img_urls


def down_img(path, img_url):
    """Download a single image and write it into ``path``.

    The file is named after the last path segment of ``img_url`` with a
    ``.jpg`` suffix appended.

    Args:
        path: Directory in which to store the image.
        img_url: URL of the image to download.

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            an error page is never written to disk as an image.
    """
    name = os.path.basename(img_url)
    path_name = os.path.join(path, name)
    # Send the same headers as the page fetch, and bound the wait so one
    # stalled download cannot block a pool worker forever.
    response = requests.get(img_url, headers=headers, timeout=10)
    response.raise_for_status()
    content = response.content
    with open(f'{path_name}.jpg', 'wb') as f:
        print('下载:', name)
        f.write(content)


if __name__ == '__main__':
    def _report_failure(exc):
        """Print an exception raised inside a pool worker."""
        # Without an error_callback, apply_async drops worker exceptions
        # silently because the AsyncResult objects are never inspected.
        print('download failed:', repr(exc))

    url = 'https://www.toutiao.com/a6810668734764548621/'
    # Fetch the page before starting the pool, so a failure here does not
    # leave ten idle worker processes behind.
    path, img_urls = get_img_urls(url)

    with Pool(10) as pool:
        for img_url in img_urls:
            # Non-blocking submission; pool.apply(...) would be the blocking
            # equivalent and would run the downloads one at a time.
            pool.apply_async(down_img, args=(path, img_url),
                             error_callback=_report_failure)

        # close() must be called before join(): after close() no new tasks
        # are accepted, and join() then waits for all workers to finish.
        pool.close()
        pool.join()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值