python 线程池爬虫案例,爬取美女图片

废话不多说,直接上代码 

import os
import time
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup

def WebPageRequests(url):
    """Download every image found in the article page at *url*.

    Reads the module-level globals ``header`` (request headers) and
    ``path`` (output directory); each image is written to
    ``{path}/{title}{name}``. Returns None; failures are skipped.
    """
    # timeout so a stalled server cannot hang a pool worker forever
    res = requests.get(url, headers=header, timeout=10)
    if res.status_code != 200:
        return
    soup = BeautifulSoup(res.text, features='lxml')
    img_tags = soup.select('.entry-content img')
    h1 = soup.select('.entry-header h1')
    if not h1:  # layout changed / no title -> nothing sensible to name files with
        return
    title = h1[0].get_text().strip()
    # Replace characters that are illegal in file names (Windows: \/:*?"<>|)
    # so open() below cannot fail on a decorative title.
    safe_title = ''.join('_' if c in '\\/:*?"<>|' else c for c in title)
    for tag in img_tags:
        src = tag.attrs.get('src')
        if not src:
            continue
        im = requests.get(src, headers=header, timeout=10)
        if im.status_code != 200:  # don't write an error page as an image
            continue
        # basename, not src[-20:]: a raw 20-char slice of the URL can still
        # contain '/' and break open(); keep the 20-char cap on the basename.
        name = os.path.basename(src)[-20:]
        print(name, src)
        with open(f'{path}/{safe_title}{name}', 'wb') as f:
            f.write(im.content)


def WebPage(url):
    """Return the list of article URLs linked from the index page at *url*.

    Reads the module-level global ``header``. Always returns a list:
    an HTTP failure yields ``[]`` so callers can iterate the result
    unconditionally (previously a non-200 response returned None,
    which crashed the submitting loop in ``__main__``).
    """
    # timeout so a stalled server cannot hang a pool worker forever
    res = requests.get(url, headers=header, timeout=10)
    if res.status_code != 200:
        return []
    soup = BeautifulSoup(res.text, features='lxml')
    return [a.attrs['href'] for a in soup.select('.post-info h2 a')]

if __name__ == '__main__':
    start = time.time()
    path = './小姐姐'
    # makedirs(exist_ok=True) avoids the exists()+mkdir race and also
    # creates missing parent directories.
    os.makedirs(path, exist_ok=True)
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
    }
    # 23 paginated tag-index pages; each lists the article pages to scrape.
    urls = [f'https://www.jdlingyu.com/tag/%e6%b8%85%e6%96%b0%e7%be%8e%e5%a5%b3/page/{url}' for url in range(1, 24)]
    with ThreadPoolExecutor(max_workers=50) as pool:
        page_futures = [pool.submit(WebPage, url) for url in urls]
        download_futures = []
        for fut in as_completed(page_futures):
            # WebPage may return None on an HTTP error; guard so the
            # submit loop below never iterates None.
            urls_page = fut.result() or []
            download_futures.extend(pool.submit(WebPageRequests, u) for u in urls_page)
        # Await the download futures: previously they were fire-and-forget,
        # so any exception inside WebPageRequests was silently swallowed.
        for fut in as_completed(download_futures):
            try:
                fut.result()
            except Exception as exc:
                print('download failed:', exc)
    end = time.time()
    print('耗时:%.2f' % (end - start))

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值