A Simple Image Scraper

Single-threaded image scraping (single process, slow; uses os + re)

import os
import re
import time

import chardet
import requests


def found(s):
    # Strip characters that are illegal in Windows file names.
    return ''.join(c for c in s if c not in '<>/\\|:"*?')


if __name__ == "__main__":
    headers = {
        "User-Agent": "Mozilla/5.0",  # placeholder; use your own browser's UA string
    }
    url = 'https://pic.netbian.com/'
    for i in range(3, 10):
        time.sleep(1)  # be polite: pause between page requests
        new_url = url
        if i > 1:
            # Page 1 is the site root; later pages are index_<n>.html.
            new_url = new_url + 'index_' + str(i) + '.html'
        response = requests.get(url=new_url, headers=headers)
        # The site is not UTF-8; detect the real encoding before decoding.
        response.encoding = chardet.detect(response.content)['encoding']
        page = response.text
        path = './scraped_images/page_' + str(i)
        os.makedirs(path, exist_ok=True)
        # ex1 captures the thumbnail src, ex2 the alt text (used as the file name).
        ex1 = r'<li><a.*?<img src="(.*?)".*?</a></li>'
        ex2 = r'<li><a.*?<img src=.*?alt="(.*?)".*?</a></li>'
        list1 = re.findall(ex1, page, re.S)
        list2 = re.findall(ex2, page, re.S)
        for j in range(len(list1)):
            name = found(list2[j])
            news = url + list1[j]
            time.sleep(1)  # pause between image downloads as well
            res = requests.get(url=news, headers=headers)
            with open(path + '/' + name + '.jpg', 'wb') as f:
                f.write(res.content)
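To make the two extraction patterns concrete, here is a quick standalone check. The `<li>` fragment below is hypothetical, shaped like the site's gallery markup; the paths in it are made up for illustration:

import re

# Hypothetical gallery fragment for illustration only.
sample = ('<li><a href="/tupian/1.html" target="_blank">'
          '<img src="/uploads/small1.jpg" alt="风景 山水">'
          '</a></li>')

ex1 = r'<li><a.*?<img src="(.*?)".*?</a></li>'          # captures the thumbnail src
ex2 = r'<li><a.*?<img src=.*?alt="(.*?)".*?</a></li>'   # captures the alt text

print(re.findall(ex1, sample, re.S))  # ['/uploads/small1.jpg']
print(re.findall(ex2, sample, re.S))  # ['风景 山水']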

Multithreaded optimization (thread pool; os + re + Pool + chardet)

import os
import re
import time

import chardet
import requests
from multiprocessing.dummy import Pool  # a thread pool behind the Pool API


def found(s):
    # Strip spaces and characters that are illegal in Windows file names.
    return ''.join(c for c in s if c not in ' <>/\\|:"*?')


def Craw(url1):
    # url1 is a (page_url, page_number) tuple built in __main__;
    # headers and url are module-level globals set there.
    urls, i = url1
    response = requests.get(url=urls, headers=headers)
    response.encoding = chardet.detect(response.content)['encoding']
    page = response.text
    path = './scraped_images/page_' + str(i)
    os.makedirs(path, exist_ok=True)
    # ex1 captures the thumbnail src, ex2 the alt text (used as the file name).
    ex1 = r'<li><a.*?<img src="(.*?)".*?</a></li>'
    ex2 = r'<li><a.*?<img src=.*?alt="(.*?)".*?</a></li>'
    list1 = re.findall(ex1, page, re.S)
    list2 = re.findall(ex2, page, re.S)
    for j in range(len(list1)):
        name = found(list2[j])
        news = url + list1[j]
        res = requests.get(url=news, headers=headers)
        with open(path + '/' + name + '.jpg', 'wb') as f:
            f.write(res.content)
    print("Page {} downloaded".format(i))


if __name__ == "__main__":
    headers = {
        "User-Agent": "Mozilla/5.0",  # placeholder; use your own browser's UA string
    }
    url = 'https://pic.netbian.com/'
    urls = []
    for i in range(1, 65):
        new_url = url
        if i > 1:
            # Page 1 is the site root; later pages are index_<n>.html.
            new_url = new_url + 'index_' + str(i) + '.html'
        urls.append((new_url, i))
    start = time.time()
    pool = Pool(len(urls))  # one worker thread per page
    pool.map(Craw, urls)    # map() blocks until every page is done
    pool.close()
    pool.join()
    print('All downloads finished, elapsed:', time.time() - start)

Pages 1-64, at 20 images per page, take roughly twenty minutes to download in total; the exact time depends mostly on your machine and network.
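Since multiprocessing.dummy.Pool is just a thread pool behind the multiprocessing API, the same fan-out can also be written with the standard library's concurrent.futures.ThreadPoolExecutor. A minimal sketch, assuming Craw and urls are defined as in the script above (the max_workers=16 cap is an arbitrary choice, not from the original):

import time
from concurrent.futures import ThreadPoolExecutor

# Same fan-out as Pool(len(urls)).map(Craw, urls), but with a bounded
# worker count instead of one thread per page.
start = time.time()
with ThreadPoolExecutor(max_workers=16) as executor:
    # Iterating the result forces executor.map to finish every page.
    list(executor.map(Craw, urls))
print('Elapsed:', time.time() - start)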
