Python 之多线程图片抓取

最新推荐文章于 2024-07-27 12:04:01 发布

codsing

最新推荐文章于 2024-07-27 12:04:01 发布

阅读量3.1k

点赞数

分类专栏： python 文章标签： python

本文链接：https://blog.csdn.net/lixing112233/article/details/104734898

版权

python 专栏收录该内容

6 篇文章 1 订阅

订阅专栏

目标站点：https://www.2717.com/

编写脚本：

import os
import threading
import time

import requests
from bs4 import BeautifulSoup

# Request headers carrying a desktop Chrome user-agent string.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/80.0.3987.132 Safari/537.36'
    ),
}

def do_request(url, tries=3):
    """Fetch *url* with a GET request, retrying when an attempt fails.

    The module-level ``headers`` are sent with every attempt so that these
    requests carry the same user-agent as the listing-page request.

    :param url: address to fetch
    :param tries: maximum number of attempts
    :return: the ``requests.Response`` of the first successful attempt, or
        ``False`` when every attempt failed
    """
    for _ in range(tries):
        try:
            res = requests.get(url, headers=headers, timeout=5)
            # Treat 4xx/5xx answers as failures; otherwise the body of an
            # error page would be handed back and saved as if it were an image.
            res.raise_for_status()
            return res
        except requests.exceptions.RequestException as e:
            print('request error.', e)

    return False

def download_local(url, img_file):
    """Download a remote image and write it to a local file.

    Nothing is written when the request fails; a message is printed instead.

    :param url: remote image address
    :param img_file: local path the image bytes are written to
    """
    res = do_request(url)
    if res is False:
        print('请求失败：', url)
        return

    # The context manager closes the file even if the write raises.
    with open(img_file, 'wb') as img:
        img.write(res.content)


def save_img(url, name, page):
    """Save one image under ``./imgs/<page>/`` unless it is already there.

    :param url: remote image address; its extension is reused for the local file
    :param name: image name, used as the base name of the local file
    :param page: page number, used as the sub-directory name
    :return: ``True`` when the file already exists, otherwise ``None``
    """
    print(url, name)
    img_path = f'./imgs/{page}/'
    # makedirs also creates a missing ``./imgs`` parent, and exist_ok avoids a
    # FileExistsError when several threads create the same page directory.
    # The mode has to be octal: decimal 755 would be 0o1363.
    os.makedirs(img_path, mode=0o755, exist_ok=True)

    # splitext keeps the leading dot of the extension, so none is added here.
    img_ext = os.path.splitext(url)[1]
    # A path separator inside the name would point outside the page directory.
    safe_name = name.replace('/', '_').replace('\\', '_')
    img_file = f'{img_path}{safe_name}{img_ext}'
    print(img_file)

    # Already downloaded: nothing to do.
    if os.path.isfile(img_file):
        print(f'img is exist. img: {img_file}')
        return True

    download_local(url, img_file)



def crawl_images(page):
    """Crawl one listing page and download every image it links to.

    Each image is saved by its own thread; the function returns only after
    all threads started for this page have finished. A page that cannot be
    fetched, or that has no image list, is reported and skipped.

    :param page: page number of the listing
    """
    url = f'https://www.2717.com/ent/meinvtupian/list_11_{page}.html'
    print(url)
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.exceptions.RequestException as e:
        print('request error.', e)
        return
    # Avoid garbled text (requests falls back to iso-8859-1 by default).
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, 'lxml')

    box = soup.find('div', class_='MeinvTuPianBox')
    if box is None:
        print(f'no image list found. page: {page}')
        return

    workers = []
    for item in box.find_all('a', class_='MMPic'):
        print('###'*20)
        img = item.find('img')
        if img is None or not img.get('src'):
            continue
        img_url = img['src']
        # Fall back to the file name from the URL when the alt text is missing.
        img_name = img.get('alt') or os.path.splitext(os.path.basename(img_url))[0]
        # One thread per image to speed up the downloads.
        t = threading.Thread(target=save_img, args=(img_url, img_name, page))
        t.start()
        workers.append(t)

    # Wait for this page's downloads so the number of live threads stays
    # bounded and the caller's timing covers the actual work.
    for t in workers:
        t.join()



def main(first_page=251, last_page=1):
    """Crawl the listing pages from *first_page* down to *last_page*.

    Both bounds are inclusive, so the default run also covers page 1.

    :param first_page: highest page number, crawled first
    :param last_page: lowest page number, crawled last
    """
    for page in range(first_page, last_page - 1, -1):
        print(f'current page: {page}')
        crawl_images(page)
        # Short pause between listing pages.
        time.sleep(0.01)



if __name__ == '__main__':
    # Script entry point: run the crawl and print how long main() took.
    print('=============== crawl images start ==============')
    began_at = time.time()
    main()
    print('=============== crawl images end ==============')
    elapsed = time.time() - began_at
    print(f'Time usage:{elapsed:.3f}')

抓取效果：