前言
在爬取的过程中,主要的速度瓶颈是脚本访问服务器的这段时间,但如果可以分工合作,同时进行,速度可能会得到大幅度提升,本文章将使用多线程对图片的下载进行加速。
分析
在往期的爬虫程序中,将图片下载到本地的时间最久,需要花费大约30分钟左右(不同的机器、不同的网络环境不同),所以接下来只针对图片进行多线程划分任务。
在此之前,需要了解数组等份划分,可按照输入的n,对数组进行n等分,这里参考了一篇牛人的文章,附上原文链接https://blog.csdn.net/weixin_39220714/article/details/88619391
import math

def list_split(lists, n):
    """Split *lists* into *n* consecutive chunks of near-equal size.

    :param lists: the sequence to split
    :param n: number of chunks to produce
    :return: a list of n sub-lists covering *lists* in order
             (some chunks may be empty when n > len(lists))
    """
    total = len(lists)
    # Chunk k covers indices [floor(k/n*total), floor((k+1)/n*total)).
    return [
        lists[math.floor(k / n * total):math.floor((k + 1) / n * total)]
        for k in range(n)
    ]

# Example run
if __name__ == '__main__':
    sample = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    print(list_split(sample, 6))
    # [[1], [2, 3], [4, 5], [6], [7, 8], [9, 10]]
有了这个好用的函数,我们就可以将前面爬取到的数组,等分分成几份(n线程),每份分配一个线程,每个线程可以并发执行,这样就可以在一定程度上加快爬取的速度。
环境
Python == 3.8.5
BeautifulSoup
PyQuery
tqdm
threading
代码
"""
在2.1基础上,下载图片过程中加入多线程
"""
import requests
from bs4 import BeautifulSoup
import os, shutil, threading, math
from pyquery import PyQuery as pq
from tqdm import tqdm
def make_dir(dir):
    """Reset *dir* to a brand-new empty directory.

    Any existing directory tree at the path is deleted first, so the
    caller always starts with a clean folder.
    """
    # ignore_errors=True makes the removal a silent no-op when the path
    # does not exist yet, so no existence check is needed.
    shutil.rmtree(dir, ignore_errors=True)
    os.makedirs(dir)
    # print('Folder successfully created', dir)
def save_img(img_urls, data_dir):
    """Download every image in *img_urls* into *data_dir*.

    :param img_urls: iterable of protocol-relative image URLs
                     (e.g. '//img.autohome.com.cn/...jpg')
    :param data_dir: existing directory the files are written into;
                     files are named by position: 0.jpg, 1.jpg, ...
    """
    for i, src in enumerate(img_urls):
        img_name = '%d.jpg' % (i)
        # The scraped src attributes are protocol-relative; add a scheme.
        src = 'http:' + src
        # A timeout keeps one stalled download from hanging its thread forever.
        content = requests.get(src, timeout=30).content
        with open(os.path.join(data_dir, img_name), 'wb') as f:
            f.write(content)
        # 'with' closes the file; the original redundant f.close() is dropped.
        # print('Successful preservation %s' % (os.path.join(data_dir, img_name)))
def list_split(lists, n):
    """Split *lists* into *n* consecutive chunks of near-equal size.

    :param lists: the sequence to split
    :param n: number of chunks to produce
    :return: a list containing the n chunks, in order
    """
    size = len(lists)
    # Boundary k is floor(k/n*size); chunk k spans [bounds[k], bounds[k+1]).
    bounds = [math.floor(k / n * size) for k in range(n + 1)]
    return [lists[lo:hi] for lo, hi in zip(bounds, bounds[1:])]
class Spy:
    """Scraper for autohome.com.cn car-picture galleries.

    Loads one brand's picture-list page, collects a record per car type,
    resolves each type's gallery detail pages (with pagination), and
    downloads the images into per-type folders under ``self.data_dir``.
    """
    def __init__(self, url, ori_url, thread_n, part=None, name =''):
        # part: gallery section titles to fetch; defaults to the
        # exterior-shots section ('车身外观') when not given.
        self.part = part
        if part is None:
            self.part = ['车身外观']
        self.ori_url = ori_url  # site root, used to absolutize relative hrefs
        # Local output folder, e.g. '<name>brand-15.html'.
        self.data_dir = name + os.path.split(url)[1]
        self.url = url
        self.thread_n = thread_n  # number of chunks/threads for downloading
        # NOTE: fetches the brand page synchronously at construction time.
        self.soup = BeautifulSoup(requests.get(url=url).text, 'lxml')
        # NOTE: make_dir wipes any previous download at this path.
        make_dir(self.data_dir)
        self.img_urls = []
        self.car_type_urls = []  # filled by get_car_type_url()
        print('Name: %s, Original URL: %s, Fetch From %s, Save as: %s' % (name, self.ori_url, self.url, self.data_dir))
    def get_car_type_url(self):
        """Parse the brand page and append one record per car type.

        Each record is a dict: {'name', 'sum', 'url', 'detail_url'}, with
        'sum' and 'detail_url' filled in later by get_car_detail_url().
        """
        print('Start Get Car Type...')
        for div in tqdm(self.soup.find_all('div', {'class':'uibox-con carpic-list02'})):
            for li in div.contents[0].contents:
                obj = {
                    'name':'',
                    'sum':'',
                    'url':'',
                    'detail_url': None
                }
                li = li.contents
                # First child <a> carries the relative type URL; the title
                # attribute of a nested element carries the display name.
                obj['url'] = self.ori_url + li[0].get('href')
                obj['name'] = li[1].contents[0].contents[0].get('title')
                self.car_type_urls.append(obj)
    def get_car_detail_url(self):
        """For every car type, find the gallery section matching self.part.

        Fills 'detail_url' with the list of paginated gallery page URLs
        (or None when the section has no link) and 'sum' with the photo
        count text shown next to the section title.
        """
        print('Start Get Detail Information...')
        for car_type in tqdm(self.car_type_urls):
            for div in pq(url=car_type['url'])('.uibox').items():
                flag = False
                for a in div('.uibox-title a').items():
                    if a.text() in self.part:
                        car_type['detail_url'] = None if a.attr('href') is None else (self.get_detail_arr(self.ori_url + a.attr('href')))
                        car_type['sum'] = div('.uibox-title .uibox-title-font12').text()
                        flag = True
                        break
                # Stop scanning sections once the wanted one was found.
                if flag:
                    break
        print(self.car_type_urls, len(self.car_type_urls))
    def download_img(self, car_url_arr):
        """Download all images for the car types in *car_url_arr*.

        Intended as a thread target: each thread receives one chunk of
        self.car_type_urls (see get_arr_car_type_urls).
        """
        print('Start Download...')
        for car_obj in tqdm(car_url_arr):
            img_dir = os.path.join(self.data_dir, car_obj['name']+car_obj['sum'])
            make_dir(img_dir)
            if car_obj['detail_url'] is None:
                continue
            img_urls = []
            for detail_url in car_obj['detail_url']:
                for img in BeautifulSoup(requests.get(url=detail_url).text, 'lxml').find_all('img'):
                    # Swap the thumbnail size token for the full-size one;
                    # srcs without the token are not gallery photos — skip.
                    src = str(img.get('src')).replace('480x360_0_q95_c42_', '1024x0_1_q95_')
                    if src.find('1024x0_1_q95_') == -1:
                        continue
                    img_urls.append(src)
            save_img(img_urls, img_dir)
    def get_detail_arr(self, url):
        """Return the list of gallery page URLs starting at *url*.

        When the gallery is paginated, follows the 'next page' link until
        it becomes the 'javascript:void(0);' placeholder; otherwise the
        single page URL is returned alone.
        """
        urls = []
        doc = pq(url)
        page = doc('.page')
        if page('a'):
            urls.append(url)
            html = pq(url)
            while True:
                next_url = html('.page .page-item-next').attr('href')
                # The last page's 'next' link is a JS no-op placeholder.
                if next_url == 'javascript:void(0);':
                    break
                else:
                    urls.append(self.ori_url + next_url)
                    html = pq(self.ori_url + next_url)
        else:
            urls.append(url)
        return urls
    def get_arr_car_type_urls(self):
        # Split the collected car types into thread_n chunks, one per thread.
        return list_split(self.car_type_urls, self.thread_n)
if __name__ == '__main__':
    ori_url = 'https://car.autohome.com.cn'
    car_obj = {
        'url': 'https://car.autohome.com.cn/pic/brand-15.html',
        'name': '宝马4.0_',
        'thread_n': 8
    }
    # BUG FIX: the dict defines no 'part' key, so car_obj['part'] raised
    # KeyError at startup; .get() returns None and lets Spy fall back to
    # its default section list.
    s = Spy(car_obj['url'], ori_url, car_obj['thread_n'], car_obj.get('part'), car_obj['name'])
    s.get_car_type_url()
    s.get_car_detail_url()
    threads = []
    for car_url_arr in s.get_arr_car_type_urls():
        thread = threading.Thread(target=s.download_img, args=(car_url_arr, ))
        thread.start()
        threads.append(thread)
    # Wait for every download thread so the script exits only when done.
    for thread in threads:
        thread.join()
【注】
在使用线程传参时thread = threading.Thread(target=s.download_img, args=(car_url_arr, ))
,target指向执行函数,不需要加上括号,args传入参数(无论是单个参数或是多个参数,末尾都要有个逗号,
),最后执行start函数,线程开始执行。