首先用Flask搭建一个本地的页面:
# Local Flask test server: every request sleeps 3 seconds to simulate a
# slow page, giving the crawler benchmarks below something to wait on.
from flask import Flask
import time

app = Flask(__name__)


@app.route('/')
def index():
    # Artificial per-request delay — the whole point of the benchmark.
    time.sleep(3)
    return 'Hello!'


if __name__ == '__main__':
    # threaded=True lets Flask answer concurrent requests, which the
    # multi-threaded crawlers depend on to show any speed-up.
    app.run(threaded=True, port=5000)
单线程爬取:
# Baseline benchmark: fetch the page `total` times sequentially.
import time
import requests

total = 100


def request():
    """Perform one blocking GET against the local test server (~3 s each)."""
    url = 'http://127.0.0.1:5000'
    r = requests.get(url, timeout=10)


if __name__ == '__main__':
    startTime = time.time()
    # Sequential: each 3-second response is waited out before the next.
    for i in range(total):
        request()
        print('爬取第{}个网页'.format(i))
    endTime = time.time()
    print('爬取{}个网页,总花费时间{}'.format(total, endTime - startTime))
    """爬取100个网页,总花费时间303.0711946487427"""
多线程爬取:
# Thread-pool benchmark via multiprocessing.dummy (thread-backed Pool API).
import time
import requests
from multiprocessing.dummy import Pool as ThreadPool

total = 100
thread = 4


def request():
    """Fetch the local test page once (blocking GET)."""
    url = 'http://127.0.0.1:5000'
    r = requests.get(url, timeout=10)


def run(i):
    """Worker body: fetch this thread's equal share of the page count."""
    for _ in range(total // thread):
        request()


if __name__ == '__main__':
    startTime = time.time()
    worker_ids = list(range(thread))
    pool = ThreadPool(thread)
    pool.map(run, worker_ids)
    pool.close()
    pool.join()
    endTime = time.time()
    print('爬取{}个网页,总花费时间{}'.format(total, endTime - startTime))
    """爬取100个网页,总花费时间76.17911839485168"""
线程池爬取:
# ThreadPoolExecutor benchmark: fan out the fetches over `thread` workers.
import time
import requests
from concurrent.futures import ThreadPoolExecutor

total = 100
thread = 4


def request(i):
    """Fetch the local test page once and log which page number this is."""
    url = 'http://127.0.0.1:5000'
    r = requests.get(url, timeout=10)
    print('这是第{}个页面------------'.format(i))


if __name__ == '__main__':
    startTime = time.time()
    # BUG FIX: range(1, 100) yields only 99 items, so only 99 pages were
    # fetched while the summary line claimed `total` (100). Derive the
    # list from `total` so count and report agree.
    urls = [i for i in range(1, total + 1)]
    # Consistency: use the `thread` constant instead of a duplicated 4.
    with ThreadPoolExecutor(thread) as executor:
        executor.map(request, urls)
    endTime = time.time()
    print('爬取{}个网页,总花费时间{}'.format(total, endTime - startTime))
    """爬取100个网页,总花费时间75.96918320655823"""
多进程爬取:
# Process-pool benchmark: 8 worker processes fetch the pages in parallel.
import time
from multiprocessing import Pool
import requests

total = 100


def request(i):
    """Fetch the local test page once and log which page number this is."""
    url = 'http://127.0.0.1:5000'
    r = requests.get(url, timeout=10)
    print('正在爬取第{}个网页'.format(i))


if __name__ == '__main__':
    startTime = time.time()
    pool = Pool(processes=8)
    # Consistency fix: derive the workload from `total` instead of a
    # duplicated hard-coded 100, so changing `total` changes everything.
    urls = [i for i in range(total)]
    pool.map(request, urls)
    pool.close()
    pool.join()
    endTime = time.time()
    print('爬取{}个网页,总花费时间{}'.format(total, endTime - startTime))
    """爬取100个网页,总花费时间86.5307264328003"""
单线程+协程爬取:
# Single-thread + coroutine benchmark: the blocking requests.get calls are
# pushed into the loop's default thread-pool executor so they overlap.
import time
import requests
import asyncio

total = 100


async def request(loop=None):
    """Fetch the page via run_in_executor so the await releases the loop.

    Robustness fix: the original read a module-global ``loop`` that only
    exists after the __main__ block assigned it; accept the loop as a
    parameter (defaulting to the current loop) instead of relying on
    definition order.
    """
    url = 'http://127.0.0.1:5000'
    if loop is None:
        loop = asyncio.get_event_loop()
    future = loop.run_in_executor(None, requests.get, url)
    response = await future


if __name__ == '__main__':
    startTime = time.time()
    # Fix: create the event loop BEFORE building tasks — ensure_future
    # needs a current loop, and implicit loop creation is deprecated on
    # modern Python.
    loop = asyncio.get_event_loop()
    tasks = [asyncio.ensure_future(request(loop)) for i in range(total)]
    loop.run_until_complete(asyncio.wait(tasks))
    loop.close()
    endTime = time.time()
    print('爬取{}个网页,总花费时间{}'.format(total, endTime - startTime))
    """爬取100个网页,总花费时间17.071962594985962"""
多线程+协程爬取:
# Threads + coroutines benchmark: each of `thread` worker threads runs its
# own private event loop and awaits its share of the pages concurrently.
import time
import requests
from multiprocessing.dummy import Pool as ThreadPool
import asyncio

total = 100
thread = 4


async def request(loop, i=1):
    """Fetch one page on the given loop's default executor.

    BUG FIX: the original always printed ``format(1)`` — every line claimed
    page 1. Accept the real index (defaulting to 1 for compatibility).
    """
    url = 'http://127.0.0.1:5000'
    print('正在爬取第{}个网页'.format(i))
    future = loop.run_in_executor(None, requests.get, url)
    response = await future


def run(i):
    """Thread worker: build a private event loop and drain its page share."""
    # Removed the redundant function-local `import asyncio` — the module
    # already imports it at the top.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    tasks = [asyncio.ensure_future(request(loop, j))
             for j in range(total // thread)]
    loop.run_until_complete(asyncio.wait(tasks))
    loop.close()


if __name__ == '__main__':
    startTime = time.time()
    # Consistency: size the pool from the `thread` constant, not a bare 4.
    pool = ThreadPool(thread)
    pool.map(run, [j for j in range(thread)])
    pool.close()
    pool.join()
    endTime = time.time()
    print('爬取{}个网页,总花费时间{}'.format(total, endTime - startTime))
    """爬取100个网页,总花费时间8.90938687324524"""