from multiprocessing.pool import Pool
import json
import os
import re
import requests
# Browser-like User-Agent sent along with HTTP requests.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/71.0.3578.98 Safari/537.36",
}
# Captures the string argument of the `JSON.parse("...")` call that holds the
# image addresses. Written as a raw string so `\(` / `\)` reach the regex
# engine untouched (in a plain literal they are invalid escape sequences), and
# with the dot escaped so only a literal "JSON.parse" matches. re.S lets the
# payload span multiple lines.
pattern = re.compile(r'JSON\.parse\("(.*?)"\),', re.S)
def get_img_urls(page_url):
    """Fetch an article page and collect the image URLs embedded in it.

    The image data is read from the first ``JSON.parse("...")`` call matched
    by the module-level ``pattern``. A directory named after the page's
    ``<title>`` is created under the current working directory as the place
    to save the images.

    Args:
        page_url: URL of the article page to scrape.

    Returns:
        A ``(path, img_urls)`` tuple: the save directory and the list of image
        URLs. The list is empty when the page carries no matching
        ``JSON.parse`` payload.

    Raises:
        json.JSONDecodeError, KeyError, IndexError: If a payload is found but
            does not have the expected ``sub_images`` / ``url_list`` layout.
    """
    # requests has no default timeout; without one a stalled server would
    # block the script forever.
    response = requests.get(page_url, headers=headers, timeout=10)
    text = response.text

    # The page title names the download folder. Fall back to a fixed name
    # instead of raising IndexError when the page has no <title>.
    title_match = re.search('<title>(.*?)</title>', text)
    title = title_match.group(1) if title_match else 'untitled'
    # A path separator inside the title would point at a nested directory
    # that does not exist, so neutralise it.
    for sep in (os.sep, os.altsep):
        if sep:
            title = title.replace(sep, '_')

    path = fr'./{title}/'
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(path, exist_ok=True)

    matches = re.findall(pattern, text)
    if not matches:
        # No embedded image data: return an empty list rather than hitting an
        # unbound `img_urls` at the return statement.
        return path, []

    # Undo the escaping of the embedded string: drop every 'u002F', turn each
    # run of three backslashes into '/', then strip the remaining backslashes.
    # NOTE(review): this presumes each '/' is written as three backslashes
    # followed by 'u002F' in the page source - confirm against a live page.
    img_str = matches[0].replace('u002F', '').replace('\\\\\\', '/').replace('\\', '')
    img_list = json.loads(img_str)

    # The third url_list entry of every sub-image is used.
    # NOTE(review): why index 2 is preferred is not visible here - confirm.
    img_urls = [sub_image['url_list'][2]['url'] for sub_image in img_list['sub_images']]
    return path, img_urls
def down_img(path, img_url):
    """Download one image and save it as ``<path>/<last URL segment>.jpg``.

    Args:
        path: Directory the image is written into.
        img_url: URL of the image to download.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status; in
            that case no file is written.
    """
    name = os.path.basename(img_url)
    path_name = os.path.join(path, name)
    # Send the same browser-like headers as the page request, and bound the
    # wait so one stalled download cannot hang a worker forever.
    response = requests.get(img_url, headers=headers, timeout=10)
    # Refuse to store an error page under a .jpg name.
    response.raise_for_status()
    content = response.content
    with open(f'{path_name}.jpg', 'wb') as f:
        print('下载:', name)
        f.write(content)
if __name__ == '__main__':
    url = 'https://www.toutiao.com/a6810668734764548621/'
    # Scrape the page first so no worker processes are spawned when the page
    # fetch itself fails.
    path, img_urls = get_img_urls(url)
    # The with-block guarantees the workers are cleaned up even if something
    # below raises. Pool.__exit__ calls terminate(), so close()/join() are
    # still invoked explicitly to let queued downloads finish.
    with Pool(10) as pool:
        # apply_async is non-blocking: all downloads are queued immediately
        # (pool.apply would instead run them one at a time).
        pending = [
            (img_url, pool.apply_async(down_img, args=(path, img_url)))
            for img_url in img_urls
        ]
        # close() must be called before join(); after close() no new tasks
        # can be submitted to the pool.
        pool.close()
        # An exception raised inside a worker only surfaces through
        # AsyncResult.get(); report it instead of losing it silently, and
        # keep going so one bad image does not abort the rest.
        for img_url, result in pending:
            try:
                result.get()
            except Exception as exc:
                print('下载失败:', img_url, exc)
        # Wait for all worker processes to exit.
        pool.join()
进程池爬取今日头条图片
最新推荐文章于 2024-07-12 17:03:16 发布