单线程爬取图片(基于单进程,耗时长 os+re)
import os
import re
import time

# Characters that are illegal in Windows file names and must be stripped
# before an image title can be used as a file name.
_ILLEGAL_CHARS = set('<>/\\|:"*?')


def found(s):
    """Return *s* with file-name-illegal characters removed.

    Sanitises the image title (the ``alt`` attribute text) so it can be
    used as the saved ``.jpg`` file name on Windows.
    """
    return ''.join(ch for ch in s if ch not in _ILLEGAL_CHARS)


if __name__ == "__main__":
    # Third-party modules are only needed when actually crawling, so they
    # are imported here; ``found`` stays importable without them.
    import chardet
    import requests

    headers = {
        # A real browser UA is required — the site rejects the default
        # python-requests User-Agent.  (The original left this value
        # empty, which was a syntax error.)
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/120.0 Safari/537.36",
    }
    url = 'https://pic.netbian.com/'
    for i in range(3, 10):
        time.sleep(1)  # be polite: throttle listing-page requests
        new_url = url
        if i > 1:  # page 1 is the bare index; later pages are index_<i>.html
            new_url = new_url + 'index_' + str(i) + '.html'
        response = requests.get(url=new_url, headers=headers)
        # The site does not serve UTF-8; detect the real encoding (GBK)
        # so the alt-text titles decode correctly.
        response.encoding = chardet.detect(response.content)['encoding']
        html = response.text
        path = './图片爬取数据/第' + str(i) + '页'
        # makedirs also creates the missing parent directory —
        # os.mkdir raised FileNotFoundError on a fresh checkout.
        os.makedirs(path, exist_ok=True)
        # src attribute -> site-relative image path; alt attribute -> title.
        ex1 = r'<li><a.*?<img src="(.*?)".*?</a></li>'
        ex2 = r'<li><a.*?<img src=.*?alt="(.*?)".*?</a></li>'
        list1 = re.findall(ex1, html, re.S)
        list2 = re.findall(ex2, html, re.S)
        for src, alt in zip(list1, list2):
            name = found(alt)
            news = url + src  # src is site-relative, e.g. uploads/...
            time.sleep(1)  # throttle the image downloads as well
            res = requests.get(url=news, headers=headers)
            with open(path + '/' + name + '.jpg', 'wb') as f:
                f.write(res.content)
多线程优化(基于线程池 os+re+pool+chardet)
import os
import re
import time
from multiprocessing.dummy import Pool

# Characters that may not appear in a Windows file name.  Unlike the
# single-threaded variant, this version also strips spaces.
_ILLEGAL_CHARS = set(' <>/\\|:"*?')


def found(s):
    """Return *s* with spaces and file-name-illegal characters removed."""
    return ''.join(ch for ch in s if ch not in _ILLEGAL_CHARS)


def Craw(url1):
    """Download every image on one listing page.

    ``url1`` is a ``(page_url, page_number)`` tuple; reads the
    module-level ``headers`` and ``url`` set by the entry point below.
    """
    # Third-party modules imported locally so the module itself can be
    # imported (e.g. for testing ``found``) without them installed.
    import chardet
    import requests

    urls, i = url1
    response = requests.get(url=urls, headers=headers)
    # Site is GBK-encoded; detect it so alt-text titles decode correctly.
    response.encoding = chardet.detect(response.content)['encoding']
    html = response.text
    # Same path as before (the backslash is now written explicitly —
    # '\图' only worked because it is not a recognised escape sequence).
    path = '爬虫\\图片爬取数据/第' + str(i) + '页'
    # makedirs creates the missing '爬虫' parent too; os.mkdir raised
    # FileNotFoundError when it did not exist yet.
    os.makedirs(path, exist_ok=True)
    # src attribute -> site-relative image path; alt attribute -> title.
    ex1 = r'<li><a.*?<img src="(.*?)".*?</a></li>'
    ex2 = r'<li><a.*?<img src=.*?alt="(.*?)".*?</a></li>'
    list1 = re.findall(ex1, html, re.S)
    list2 = re.findall(ex2, html, re.S)
    for src, alt in zip(list1, list2):
        name = found(alt)
        news = url + src  # src is site-relative, e.g. uploads/...
        res = requests.get(headers=headers, url=news)
        with open(path + '/' + name + '.jpg', 'wb') as f:
            f.write(res.content)
    print("第{}页下载完毕".format(i))


if __name__ == "__main__":
    headers = {
        # A real browser UA is required — the site rejects the default
        # python-requests User-Agent.  (The original left this value
        # empty, which was a syntax error.)
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/120.0 Safari/537.36",
    }
    url = 'https://pic.netbian.com/'
    urls = []
    for i in range(1, 65):
        new_url = url
        if i > 1:  # page 1 is the bare index; later pages are index_<i>.html
            new_url = new_url + 'index_' + str(i) + '.html'
        urls.append((new_url, i))
    start = time.time()
    # One worker thread per page — acceptable because the work is
    # I/O-bound (the GIL is released while waiting on the network).
    pool = Pool(len(urls))
    pool.map(Craw, urls)  # blocks until every page has finished
    close = time.time()
    print('下载完毕,耗时', close - start)
    pool.close()
    pool.join()
共64页图片,每页20个,下载时间约为二十分钟,主要是看你电脑的性能