Multi-threaded, queue-based image scraping and saving to local disk with urllib, re, queue, threading, bs4, and requests

This article shows how to use Python's urllib, re, queue, threading, bs4, and requests libraries, combined with multithreading and a thread-safe queue, to scrape images from web pages efficiently and save them to local disk.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from urllib import request
from bs4 import BeautifulSoup as bs
import threading
from queue import Queue, Empty
import requests
import re
import os
import socket

# Set the default socket timeout to 5 seconds so stalled downloads fail fast
socket.setdefaulttimeout(5)


class GetPic(threading.Thread):
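    """Worker thread: pulls listing-page URLs off the shared queue and scrapes the images on each page."""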

    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        # Drain the shared queue. get_nowait() can still raise Empty if
        # another thread empties the queue between the check and the get,
        # so catch it and let the thread exit cleanly.
        while not self.queue.empty():
            try:
                url = self.queue.get_nowait()
            except Empty:
                break
            self.spider(url)

    def spider(self, url):
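        """Fetch one listing page, extract the .jpg links, and save each image under pic/."""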
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
            'Referer': 'http://www.ccppg.cn/'
        }
        r = requests.get(url=url, headers=headers)
        r.encoding = 'GBK'  # the site serves GBK-encoded pages
        soup = bs(r.text, 'html.parser')
        # Match <img> tags whose src contains '.jpg'; escape the dot so the
        # regex matches it literally.
        y = soup.find_all('img', src=re.compile(r'\.jpg'))
        for i in y:
            # Keep only the target thumbnails, identified on this site by
            # the height="159" attribute in the tag.
            if 'height="159"' in str(i):
                t = re.findall('src="(.*?)"', str(i))
                for tt in t:
                    # tt is the image URL; name the file after its last path segment
                    name = tt.split('/')[-1]
                    dirname = 'pic'
                    filepath = dirname + '/' + name
                    # exist_ok=True avoids the race where two threads try to
                    # create the directory at the same time
                    os.makedirs(dirname, exist_ok=True)
                    try:
                        # urlretrieve() does not use requests' headers, so
                        # install an opener that sends a browser User-Agent.
                        opener = request.build_opener()
                        opener.addheaders = [('User-Agent',
                                              'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36')]
                        request.install_opener(opener)
                        # filepath already includes the file name
                        request.urlretrieve(url=tt, filename=filepath)
                    except socket.timeout:
                        # Retry up to 5 times on timeout before giving up.
                        count = 1
                        while count <= 5:
                            try:
                                request.urlretrieve(url=tt, filename=filepath)
                                break
                            except socket.timeout:
                                err_info = 'Reloading for %d time' % count if count == 1 else 'Reloading for %d times' % count
                                print(err_info)
                                count += 1
                        if count > 5:
                            print("download job failed!")


def main():
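    """Fill the queue with listing-page URLs and fan the work out to worker threads."""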
    queue = Queue(500)
    for i in range(2, 18):
        # Enqueue the listing-page URLs (index_2.html through index_17.html)
        queue.put('http://www.ccppg.cn/books/ts/index_' + str(i) + '.html')

    threads = []

    thread_count = 50  # more workers than pages; surplus threads exit as soon as the queue is empty

    for i in range(thread_count):
        threads.append(GetPic(queue))

    # Start every thread before joining any of them; calling join() inside
    # the start loop would run the workers one at a time and defeat the
    # point of multithreading.
    for t in threads:
        t.start()
    for t in threads:
        t.join()


if __name__ == '__main__':
    main()

Execution result:

The scraped .jpg files are saved under the pic/ directory. (Screenshot omitted.)
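As a footnote, the hand-rolled Queue plus thread list in main() is the classic producer/consumer pattern; the standard library's concurrent.futures can express the same fan-out more compactly. A minimal sketch under that assumption (crawl_all is a hypothetical helper, and since spider() never actually reads the queue, a dummy instance works):

from concurrent.futures import ThreadPoolExecutor


def crawl_all():
    urls = ['http://www.ccppg.cn/books/ts/index_%d.html' % i
            for i in range(2, 18)]
    worker = GetPic(Queue())  # spider() ignores the queue argument
    # The executor replaces the explicit Queue and thread list in main():
    # each page URL becomes one task, spread over 16 worker threads.
    with ThreadPoolExecutor(max_workers=16) as pool:
        list(pool.map(worker.spider, urls))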
