Synchronous and Asynchronous Image Downloading, and Fixing Chinese-Character URL Encoding

The images are all family-friendly; a sample is shown below:
[sample image]
Synchronous crawler

import requests
from lxml import etree
from urllib import request
import os
import re


def parse_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    # text = response.text  # if parsing fails, decode manually as below
    text = response.content.decode('utf-8')
    html = etree.HTML(text)
    imgs = html.xpath("//div[@class='TypeList']/ul/li//img")
    names = html.xpath("//div[@class='TypeList']/ul/li//span")
    imgurls = []
    filenames = []
    for img in imgs:
        img_url = img.get('src')
        img_url = request.quote(img_url, safe=';/?:@&=+$,', encoding='utf-8')  # fix Chinese characters in the URL
        imgurls.append(img_url)  # a URL containing raw Chinese characters cannot be requested by urllib
    for index, name in enumerate(names):
        filename = name.text
        filename = re.sub(r'[\s]', '_', filename)
        suffix = os.path.splitext(imgurls[index])[1]
        filenames.append(filename+suffix)
    img_download(imgurls, filenames)


def img_download(imgurls, filenames):
    os.makedirs('images', exist_ok=True)  # make sure the target directory exists
    for index, imgurl in enumerate(imgurls):
        request.urlretrieve(imgurl, 'images/'+filenames[index])
        print('%s downloaded' % filenames[index])


def main():
    for i in range(1, 10):
        url = 'http://www.umei.cc/tupiandaquan/shuaigetupian/%d.htm' % i
        parse_page(url)
        break  # demo: only crawl the first page


if __name__ == '__main__':
    main()
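
The heart of the encoding fix is request.quote (urllib.request re-exports quote from urllib.parse): it percent-encodes non-ASCII characters as UTF-8 bytes, while the safe parameter preserves the reserved characters that give the URL its structure. A minimal sketch, using a made-up URL:

from urllib import request

# Hypothetical URL whose path contains Chinese characters (illustration only)
raw_url = 'http://example.com/uploads/图片_01.jpg'
# Percent-encode the non-ASCII bytes but leave ; / ? : @ & = + $ , intact
safe_url = request.quote(raw_url, safe=';/?:@&=+$,', encoding='utf-8')
print(safe_url)
# -> http://example.com/uploads/%E5%9B%BE%E7%89%87_01.jpg

Without this step, urllib fails on the raw Chinese characters when it builds the HTTP request line.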

Upgrading to an asynchronous crawler

The upgraded version is a multi-threaded producer-consumer pipeline: Producer threads parse list pages and push (image URL, filename) pairs into a shared queue, while Consumer threads pull from that queue and download.

import requests
from lxml import etree
from queue import Queue
from urllib import request
import os
import re
import threading
import socket
socket.setdefaulttimeout(20)  # make urlretrieve raise socket.timeout instead of hanging


class Producer(threading.Thread):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }

    def __init__(self, page_queue, img_queue):
        super(Producer, self).__init__()
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        flag = 0  # the page queue starts out empty before main() fills it; don't treat that as finished
        while True:
            if self.page_queue.empty() and flag == 1:
                print('Page queue is empty, stopping URL production')
                break
            if self.page_queue.empty() and flag == 0:
                print('Initial page queue is still empty, retrying')
                continue
            url = self.page_queue.get()
            self.parse_page(url)
            flag = 1

    def parse_page(self, url):
        response = requests.get(url, headers=self.headers)
        # text = response.text  # if parsing fails, decode manually as below
        text = response.content.decode('utf-8')
        html = etree.HTML(text)
        imgs = html.xpath("//div[@class='TypeList']/ul/li//img")
        names = html.xpath("//div[@class='TypeList']/ul/li//div[@class='ListTit']")
        imgurls = []
        for img in imgs:
            img_url = img.get('src')
            imgurls.append(img_url)
        for index, name in enumerate(names):
            filename = name.text
            filename = re.sub(r'[\s]', '_', filename)
            suffix = os.path.splitext(imgurls[index])[1]
            self.img_queue.put((imgurls[index], filename+suffix))


class Consumer(threading.Thread):
    def __init__(self, page_queue, img_queue):
        super(Consumer, self).__init__()
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            if self.page_queue.empty() and self.img_queue.empty():
                print('Image queue and page queue are both empty, stopping downloads')
                break
            self.img_download(self.img_queue)

    def img_download(self, img_queue):
        img_url, filename = img_queue.get()
        print('##################################################')
        print('loading ------ downloading image:', filename)
        try:
            print(img_url + filename + '\n')
            img_url = request.quote(img_url, safe=';/?:@&=+$,', encoding='utf-8')
            request.urlretrieve(img_url, 'images/'+filename)
        except socket.timeout:
            print('%s: first download attempt failed' % filename)
            count = 1
            while count <= 5:
                print('Retry attempt %s' % count)
                try:
                    request.urlretrieve(img_url, 'images/'+filename)
                    break
                except socket.timeout:
                    print('Reloading for %d times【warning】' % count)
                    count += 1
            if count > 5:
                print('%s: all retries failed【Failed】' % filename)
            else:
                print('%s: retry download complete【info】' % filename)
        print('%s ------ download complete' % filename)
        print('************************************************************************************\n\n')


def main():
    os.makedirs('images', exist_ok=True)  # make sure the download directory exists
    page_queue = Queue(100)
    img_queue = Queue(1000)
    for i in range(1, 26):
        url = 'http://www.umei.cc/p/gaoqing/cn/%s.htm' % i
        page_queue.put(url)

    for x in range(3):
        t = Producer(page_queue, img_queue)
        t.start()

    for x in range(3):
        t = Consumer(page_queue, img_queue)   # image-download threads
        t.start()
    # quick sanity check for the URL-quoting fix:
    # str = 'http://i1.whymtj.com/uploads/tu/201901/10002/caodq91_看图王.jpg'
    # a = request.quote(str, safe=';/?:@&=+$,', encoding='utf-8')
    # print(a)


if __name__ == '__main__':
    main()
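
One caveat about the exit logic above: the empty() checks race against the blocking get() calls, so a Consumer can hang forever on a drained queue, or quit early while a slow Producer is still parsing. A common alternative, sketched below under the assumption of one sentinel per consumer thread (this is not from the original post), is to signal shutdown explicitly:

import threading
from queue import Queue

SENTINEL = None  # hypothetical marker meaning "no more work"

def consumer(img_queue):
    while True:
        item = img_queue.get()  # blocks until work or a sentinel arrives
        if item is SENTINEL:
            break  # producers are done and the queue is drained
        img_url, filename = item
        print('would download %s from %s' % (filename, img_url))

img_queue = Queue()
workers = [threading.Thread(target=consumer, args=(img_queue,)) for _ in range(3)]
for w in workers:
    w.start()

img_queue.put(('http://example.com/a.jpg', 'a.jpg'))  # illustrative work item
for _ in workers:
    img_queue.put(SENTINEL)  # one sentinel per consumer thread
for w in workers:
    w.join()

Because get() blocks, no busy-waiting or flag bookkeeping is needed; each consumer exits exactly once it sees a sentinel.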

The fix for the Chinese URL encoding problem was inspired by this post: https://blog.csdn.net/mouday/article/details/80278938
Yep, one for the little notebook.
