单线程爬取图片(基于单进程,耗时长 os+re)
import os
import re
import time

# Characters that are illegal in Windows file names and must be stripped
# before an image title can be used as a file name.
_ILLEGAL_CHARS = set('<>/\\|:"*?')


def found(s):
    """Return *s* with file-name-illegal characters removed.

    Sanitises the image title (the ``alt`` attribute text) so it can be
    used as the saved ``.jpg`` file name on Windows.
    """
    return ''.join(ch for ch in s if ch not in _ILLEGAL_CHARS)


if __name__ == "__main__":
    # Third-party modules are only needed when actually crawling, so they
    # are imported here; ``found`` stays importable without them.
    import chardet
    import requests

    headers = {
        # A real browser UA is required — the site rejects the default
        # python-requests User-Agent.  (The original left this value
        # empty, which was a syntax error.)
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/120.0 Safari/537.36",
    }
    url = 'https://pic.netbian.com/'
    for i in range(3, 10):
        time.sleep(1)  # be polite: throttle listing-page requests
        new_url = url
        if i > 1:  # page 1 is the bare index; later pages are index_<i>.html
            new_url = new_url + 'index_' + str(i) + '.html'
        response = requests.get(url=new_url, headers=headers)
        # The site does not serve UTF-8; detect the real encoding (GBK)
        # so the alt-text titles decode correctly.
        response.encoding = chardet.detect(response.content)['encoding']
        html = response.text
        path = './图片爬取数据/第' + str(i) + '页'
        # makedirs also creates the missing parent directory —
        # os.mkdir raised FileNotFoundError on a fresh checkout.
        os.makedirs(path, exist_ok=True)
        # src attribute -> site-relative image path; alt attribute -> title.
        ex1 = r'<li><a.*?<img src="(.*?)".*?</a></li>'
        ex2 = r'<li><a.*?<img src=.*?alt="(.*?)".*?</a></li>'
        list1 = re.findall(ex1, html, re.S)
        list2 = re.findall(ex2, html, re.S)
        for src, alt in zip(list1, list2):
            name = found(alt)
            news = url + src  # src is site-relative, e.g. uploads/...
            time.sleep(1)  # throttle the image downloads as well
            res = requests.get(url=news, headers=headers)
            with open(path + '/' + name + '.jpg', 'wb') as f:
                f.write(res.content)
多线程优化(基于线程池 os+re+pool+chardet)
import os
import re
import time
from multiprocessing.dummy import Pool

# Characters that may not appear in a Windows file name.  Unlike the
# single-threaded variant, this version also strips spaces.
_ILLEGAL_CHARS = set(' <>/\\|:"*?')


def found(s):
    """Return *s* with spaces and file-name-illegal characters removed."""
    return ''.join(ch for ch in s if ch not in _ILLEGAL_CHARS)


def Craw(url1):
    """Download every image on one listing page.

    ``url1`` is a ``(page_url, page_number)`` tuple; reads the
    module-level ``headers`` and ``url`` set by the entry point below.
    """
    # Third-party modules imported locally so the module itself can be
    # imported (e.g. for testing ``found``) without them installed.
    import chardet
    import requests

    urls, i = url1
    response = requests.get(url=urls, headers=headers)
    # Site is GBK-encoded; detect it so alt-text titles decode correctly.
    response.encoding = chardet.detect(response.content)['encoding']
    html = response.text
    # Same path as before (the backslash is now written explicitly —
    # '\图' only worked because it is not a recognised escape sequence).
    path = '爬虫\\图片爬取数据/第' + str(i) + '页'
    # makedirs creates the missing '爬虫' parent too; os.mkdir raised
    # FileNotFoundError when it did not exist yet.
    os.makedirs(path, exist_ok=True)
    # src attribute -> site-relative image path; alt attribute -> title.
    ex1 = r'<li><a.*?<img src="(.*?)".*?</a></li>'
    ex2 = r'<li><a.*?<img src=.*?alt="(.*?)".*?</a></li>'
    list1 = re.findall(ex1, html, re.S)
    list2 = re.findall(ex2, html, re.S)
    for src, alt in zip(list1, list2):
        name = found(alt)
        news = url + src  # src is site-relative, e.g. uploads/...
        res = requests.get(headers=headers, url=news)
        with open(path + '/' + name + '.jpg', 'wb') as f:
            f.write(res.content)
    print("第{}页下载完毕".format(i))


if __name__ == "__main__":
    headers = {
        # A real browser UA is required — the site rejects the default
        # python-requests User-Agent.  (The original left this value
        # empty, which was a syntax error.)
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/120.0 Safari/537.36",
    }
    url = 'https://pic.netbian.com/'
    urls = []
    for i in range(1, 65):
        new_url = url
        if i > 1:  # page 1 is the bare index; later pages are index_<i>.html
            new_url = new_url + 'index_' + str(i) + '.html'
        urls.append((new_url, i))
    start = time.time()
    # One worker thread per page — acceptable because the work is
    # I/O-bound (the GIL is released while waiting on the network).
    pool = Pool(len(urls))
    pool.map(Craw, urls)  # blocks until every page has finished
    close = time.time()
    print('下载完毕,耗时', close - start)
    pool.close()
    pool.join()
共64页图片,每页20个,下载时间约为二十分钟,主要是看你电脑的性能