Benchmarking the crawler with multithreading, multiprocessing, and gevent respectively.
# monkey.patch_all() must run before the other imports so that the sockets
# used by requests become cooperative (non-blocking) under gevent.
from gevent import monkey
monkey.patch_all()
from os import path, makedirs
from time import time
from threading import Thread
from multiprocessing import Pool as MPool
from gevent.pool import Pool
import requests
from bs4 import BeautifulSoup
from functools import wraps


def timer(fun):
    """Decorator: report elapsed time and the number of downloaded photos."""
    @wraps(fun)
    def wrapper(*args, **kwargs):
        self = args[0]
        start_at = time()
        f = fun(*args, **kwargs)
        use_dt = int(time() - start_at)
        print("Total time: {} seconds".format(use_dt))
        print("Downloaded {} photos in total".format(self.count))
        return f
    return wrapper


class DownLoadBaiDu(object):
    """Download the images from Baidu Tieba posts matching a search keyword."""

    def __init__(self, keyword, full=False, start: int = 0, stop: int = 1):
        self.full = full
        self.search_url = "https://tieba.baidu.com/f?ie=utf-8&kw={}&fr=search".format(keyword)
        self.start = start
        self.stop = stop
        self.count = 0

    @property
    def image_path(self):
        img_path = path.join(path.dirname(__file__), "img")
        if not path.exists(img_path):
            makedirs(img_path)
        return img_path

    def save_jpg(self, url):
        print("url:{}".format(url))
        _jpg = path.join(self.image_path, path.split(url)[1])
        jpg_content = requests.get(url).content
        with open(_jpg, "wb") as f:
            f.write(jpg_content)
        self.count += 1

    def download_img(self, uri):
        url = "https://tieba.baidu.com/{}".format(uri)
        html = requests.get(url).text
        print("\nPost link: {}\n".format(url))
        soup = BeautifulSoup(html, "html.parser")
        imgs = soup.find_all("img")
        for img in imgs:
            _img = img.attrs["src"]
            if _img.startswith("http") and _img.endswith("jpg"):
                self.save_jpg(url=_img)

    def get_url_number(self):
        """Collect the URIs of the posts in the search results."""
        numbers = []
        h = requests.get(self.search_url).text
        soup = BeautifulSoup(h, "html.parser")
        ds = soup.find_all(attrs={"class": "j_th_tit"})
        print("Found {} posts in total".format(len(ds)))
        if not self.full:
            ds = ds[self.start: self.stop]
        for index, i in enumerate(ds):
            if len(i) == 1:
                uri = i.attrs["href"]
            else:
                uri = i.find("a").attrs["href"]
            numbers.append(uri)
        return numbers

    @timer
    def multipr_down(self):
        """Multiprocessing; comment out the gevent-related imports (and monkey.patch_all) when running this version."""
        urls = self.get_url_number()
        p = MPool()
        for url in urls:
            p.apply_async(self.download_img, args=(url,))
        p.close()
        p.join()

    @timer
    def thread_down(self):
        """Multithreading."""
        urls = self.get_url_number()
        ts = []
        for url in urls:
            t = Thread(target=self.download_img, args=[url])
            ts.append(t)
        for i in ts:
            i.start()
        for i in ts:
            i.join()

    @timer
    def gevent_down(self):
        """gevent coroutine pool."""
        urls = self.get_url_number()
        pool = Pool(len(urls))
        pool.map(self.download_img, urls)

    @timer
    def get(self):
        """Single process."""
        numbers = self.get_url_number()
        for index, number in enumerate(numbers):
            print("Downloading images from post {}".format(index + 1))
            self.download_img(number)


if __name__ == '__main__':
    d = DownLoadBaiDu(keyword="美女", full=True)
    d.gevent_down()
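    # To reproduce the other timings noted below, call the corresponding
    # method defined above instead. As the multipr_down docstring says,
    # comment out the gevent imports and monkey.patch_all() before running
    # the multiprocessing version.
    # d.get()            # single process
    # d.thread_down()    # multithreading
    # d.multipr_down()   # multiprocessing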
"""
单进程:
一共耗时155秒
一共下载718张照片
多线程:
一共耗时34秒
一共下载866张照片
Gevent:
一共耗时30秒
一共下载770张照片
"""