同步
# import requests
# def parse_page(res):
# print('PARSE %s' %(len(res)))
#
# def get_page(url):
# print('GET %s' %url)
# response=requests.get(url)
# if response.status_code == 200:
# return response.text
#
#
# if __name__ == '__main__':
# urls=[
# 'https://www.baidu.com',
# 'https://www.taobao.com',
# 'https://www.tmall.com',
# ]
# for url in urls:
# res=get_page(url)
# parse_page(res)
#
多线程异步提交任务
from threading import Thread
import requests
def parse_page(res):
    """Callback invoked with a fetched page body; reports its length."""
    body_size = len(res)
    print('PARSE %s' % body_size)
def get_page(url, callback=parse_page):
    """Fetch *url* and, on HTTP 200, hand the body text to *callback*."""
    print('GET %s' % url)
    resp = requests.get(url)
    if resp.status_code != 200:
        return
    callback(resp.text)
if __name__ == '__main__':
    urls = [
        'https://www.baidu.com',
        'https://www.taobao.com',
        'https://www.openstack.org',
    ]
    # One worker thread per URL; get_page calls parse_page itself,
    # so fetching and parsing overlap across threads.
    workers = [Thread(target=get_page, args=(url, )) for url in urls]
    for worker in workers:
        worker.start()
线程池与进程池
# from threading import Thread
import requests
from concurrent.futures import ThreadPoolExecutor
def parse_page(res):
    """Done-callback for the pool: *res* is a completed Future whose
    result is the downloaded page text (or None)."""
    page_text = res.result()
    print('PARSE %s' % len(page_text))
def get_page(url):
    """Download *url*; return the body on HTTP 200, otherwise None."""
    print('GET %s' % url)
    resp = requests.get(url)
    return resp.text if resp.status_code == 200 else None
if __name__ == '__main__':
    pool = ThreadPoolExecutor(50)
    urls = [
        'https://www.baidu.com',
        'https://www.taobao.com',
        'https://www.openstack.org',
    ]
    for url in urls:
        # Submit the download; parse_page fires when the Future completes.
        future = pool.submit(get_page, url)
        future.add_done_callback(parse_page)
    pool.shutdown(wait=True)
开启协程
def parse_page(res):
    """Log the current thread's name and the length of the page body.

    :param res: fetched page body (anything supporting len()).
    """
    # Thread.getName() is deprecated since Python 3.10; .name is equivalent.
    print('%s PARSE %s' % (current_thread().name, len(res)))
def get_page(url, callback=parse_page):
    """Fetch *url*; on HTTP 200 pass the body text to *callback*.

    :param url: address to download.
    :param callback: receives the response text (defaults to parse_page).
    """
    # Thread.getName() is deprecated since Python 3.10; .name is equivalent.
    print('%s GET %s' % (current_thread().name, url))
    response = requests.get(url)
    if response.status_code == 200:
        callback(response.text)
if __name__ == '__main__':
    urls = [
        'https://www.baidu.com',
        'https://www.taobao.com',
        'https://www.openstack.org',
    ]
    # NOTE(review): gevent is only imported further down in this notes
    # file; this section on its own would raise NameError — confirm the
    # intended ordering.
    tasks = [gevent.spawn(get_page, url) for url in urls]
    gevent.joinall(tasks)
gevent开启协程
from threading import current_thread
# from gevent import monkey;monkey.patch_all()
import gevent
import requests
def parse_page(res):
    """Log the handling (green) thread's name and the body length.

    :param res: fetched page body (anything supporting len()).
    """
    # Thread.getName() is deprecated since Python 3.10; .name is equivalent.
    print('%s PARSE %s' % (current_thread().name, len(res)))
def get_page(url,callback=parse_page):
    """Fetch *url*; on HTTP 200 pass the body text to *callback*.

    NOTE(review): without monkey.patch_all() (commented out above this
    section in the file), requests.get blocks and gevent gains nothing —
    confirm whether the patch line was meant to be enabled.
    """
    # Thread.getName() is deprecated since Python 3.10; .name is equivalent.
    print('%s GET %s' % (current_thread().name, url))
    response = requests.get(url)
    if response.status_code == 200:
        callback(response.text)
if __name__ == '__main__':
    urls = [
        'https://www.baidu.com',
        'https://www.taobao.com',
        'https://www.openstack.org',
    ]
    # Spawn one greenlet per URL, then block until all have finished.
    tasks = [gevent.spawn(get_page, url) for url in urls]
    gevent.joinall(tasks)
asyncio异步模块
''''''
# yield: 可以保存任务的状态后面跟任意类型普通类型 yield from:需要跟迭代器类型
'''
yield:
def y1():
for line in range(5):
yield [line for line in range(5)]
y = y1()
for line in y:
for line2 in line:
print(line2)
yield from :
def y1():
for line in range(5):
yield from [line for line in range(5)]
y = y1()
for line in y:
print(line)
'''
'''
协程:
asyncio模块使用
'''
# 1
'''
from threading import current_thread
import asyncio
@asyncio.coroutine # 翻译: 异步协程 装饰在IO操作的任务
def task(task_id, seconds):
print('%s run %s' %(task_id, seconds))
yield from asyncio.sleep(seconds)
print('%s done' % task_id)
if __name__ == '__main__':
tasks = [
task('任务1', 3),
task('任务2', 2),
task('任务3', 1),
]
# 循环所有IO任务
loop = asyncio.get_event_loop()
# 直到运行完毕
loop.run_until_complete(asyncio.wait(tasks))
loop.close()
'''
# 2
'''
from threading import current_thread
import asyncio
import time
@asyncio.coroutine # 异步协程,装饰在IO任务中
def task(task_id, seconds):
print('%s run %s seconds!' % (task_id, seconds) )
yield from asyncio.sleep(seconds)
print('%s done!' % task_id)
if __name__ == '__main__':
tasks = [
task('任务1', 3),
task('任务2', 2),
task('任务3', 1)
]
# 得到循环事件
pool = asyncio.get_event_loop()
# 直到所有任务执行完毕
pool.run_until_complete(asyncio.wait(tasks))
# 关闭循环事件
pool.close()
'''
# 3
'''
import asyncio
@asyncio.coroutine # 异步协程、装饰在IO任务中
def task(task_id, seconds):
print('%s run %s second!' % (task_id, seconds))
yield from asyncio.sleep(seconds)
print('%s done' % task_id)
if __name__ == '__main__':
tasks = [
task('任务1', 3),
task('任务2', 2),
task('任务3', 2),
]
# 获取循环事件
loop = asyncio.get_event_loop()
# 直到任务完成后
loop.run_until_complete(asyncio.wait(tasks)) # 等待协程任务完成后
# 关闭循环事件
loop.close()
'''
# 4
'''
import asyncio
@asyncio.coroutine # 异步协程,需要装饰在IO任务中
def task(task_id, seconds):
print('%s run %s' % (task_id, seconds))
yield from asyncio.sleep(seconds)
print('%s done!' % task_id)
if __name__ == '__main__':
tasks = [
task('任务1', 3),
task('任务2', 2),
task('任务3', 1),
]
# 获取循环事件
loop = asyncio.get_event_loop()
# 直到任务完成之后
loop.run_until_complete(asyncio.wait(tasks))
# 关闭事件
loop.close()
'''
# 5
'''
import asyncio
@asyncio.coroutine # 异步协程: 装饰在IO任务中
def task(task_id, seconds):
print('%s run %s second' % (task_id, seconds))
yield from asyncio.sleep(seconds)
print('%s done!' % task_id)
if __name__ == '__main__':
tasks = [
task('任务1', 3),
task('任务2', 2),
task('任务3', 1),
]
# 获取循环事件
loop = asyncio.get_event_loop()
# 直到任务完成之后
loop.run_until_complete(asyncio.wait(tasks))
# 关闭循环事件
loop.close()
'''
# ================================= asyncio爬虫使用 =================================
from threading import current_thread
import asyncio
import requests
def parse_page(res):
    """Callback: log the handling thread and the length of the page body.

    :param res: page body text (anything supporting len()).
    """
    # Thread.getName() is deprecated since Python 3.10; .name is equivalent.
    print('%s PARSE %s' % (current_thread().name, len(res)))
# @asyncio.coroutine was removed in Python 3.11; "async def" declares the
# same coroutine in every supported version.
async def get_page(url, callback=parse_page):
    """Coroutine: fetch *url* and pass the body to *callback* on HTTP 200.

    NOTE(review): requests.get() is a blocking call, so this coroutine
    never yields to the event loop — the URLs are effectively fetched
    serially. An asyncio-native HTTP client would be needed for real
    concurrency; confirm whether that matters for these notes.
    """
    print('%s GET %s' % (current_thread().getName(), url))
    response = requests.get(url)
    if response.status_code == 200:
        callback(response.text)
if __name__ == '__main__':
    urls=[
        'https://www.baidu.com',
        'https://www.taobao.com',
        'https://www.openstack.org',
    ]
    coros = [get_page(url) for url in urls]
    # asyncio.wait() rejects bare coroutine objects on Python 3.11+;
    # gather() accepts them on every version. get_event_loop() is
    # deprecated outside a running loop, so create the loop explicitly
    # and always close it, even if a fetch raises.
    loop = asyncio.new_event_loop()
    try:
        loop.run_until_complete(asyncio.gather(*coros))
    finally:
        loop.close()
twisted框架
# getPage: 发送http请求函数 defer: 延迟
from twisted.web.client import getPage, defer
# reactor反应堆
from twisted.internet import reactor
# 解析函数
def parser_page(res):
    """Parse one response body and report its size.

    The return value is what all_done() eventually receives — a callback
    must return a result for it to propagate through the Deferred chain.
    """
    size = len(res)
    print('解析结果: %s' % size)
    return size
# 所有任务执行完的结果都会到这个函数里
def all_done(res):
    """Fires once every task's result is in: dump them, stop the reactor."""
    print(res)
    reactor.stop()
# The three sites to crawl.
urls = [
    'https://www.baidu.com',
    'https://www.taobao.com',
    'https://www.cnblogs.com',
]
tasks = []
for url in urls:
    # getPage() requires the URL as bytes.
    deferred = getPage(url.encode('utf-8'))
    # Attach the parser so each response is handled as it arrives.
    deferred.addCallback(parser_page)
    tasks.append(deferred)
'''
ps: Compared with tornado, twisted maintains the pending-task counter
for you: +1 when a task is submitted, -1 when its parsing finishes;
once the counter hits zero, all_done fires.
'''
# DeferredList watches all tasks in the event loop; addBoth runs
# all_done once every result (success or failure) has arrived.
defer.DeferredList(tasks).addBoth(all_done)
# Run the reactor; control only returns after all_done stops it.
reactor.run()
# A reactor.stop() placed here would never execute — run() blocks until
# all_done has already stopped the reactor.
# reactor.stop()
tornado框架
# 导入tornado异步Http客户端
from tornado.httpclient import AsyncHTTPClient
# 导入tornado Request请求
from tornado.httpclient import HTTPRequest
# 导入tornado循环IO循环事件
from tornado import ioloop
# 得到返回结果回调函数
def handle_response(response):
    """Handle one fetched response.

    To stop the IO loop you must maintain a pending-request counter
    yourself and call ioloop.IOLoop.current().stop() when it hits zero.

    :param response: fetched HTTP response with .error / .body attributes.
    :return: None
    """
    if response.error:
        print("Error:", response.error)
    else:
        print(response.body)
    # Stopping here would end the loop after the very first response:
    # ioloop.IOLoop.current().stop()
def func():
    """Queue one async fetch per URL; handle_response consumes each reply."""
    url_list = [
        'http://www.baidu.com',
        'http://www.cnblogs.com',
    ]
    for url in url_list:
        print(url)
        # Build an async client and wire handle_response up as the
        # completion callback for this request.
        http_client = AsyncHTTPClient()
        http_client.fetch(HTTPRequest(url), handle_response)
# Start the IO loop to watch for events; func() is triggered as soon as
# the loop starts, and each fetch's callback fires as its result arrives.
io_loop = ioloop.IOLoop.current()
io_loop.add_callback(func)
io_loop.start()
# Unreachable while the loop runs — see the counter variant below for
# how to stop it properly:
# ioloop.IOLoop.current().stop()
# 添加计数器
# ================================ tornado计数器需要自己写 ================================
# 导入tornado异步Http客户端
from tornado.httpclient import AsyncHTTPClient
# 导入tornado Request请求
from tornado.httpclient import HTTPRequest
# 导入tornado循环IO循环事件
from tornado import ioloop
# 定义一个计数器
# n = 0
#
# # 得到返回结果回调函数
# def handle_response(response):
# global n
# """
# 处理返回值内容(需要维护计数器,来停止IO循环),调用 ioloop.IOLoop.current().stop()
# :param response:
# :return:
# """
# try:
# if response.error:
# print("Error:", response.error)
# else:
# # 如果抛异常就不会走到结束循环,所以我们需要try
# raise TypeError
# print(response.body)
#
# except Exception as e:
# print(e)
#
# finally:
# n -= 1
# if n == 0:
#
# # 计数为0的时候结束循环
# ioloop.IOLoop.current().stop()
#
#
# def func():
# global n
# url_list = [
# 'http://www.baidu.com',
# 'http://www.cnblogs.com',
# ]
# for url in url_list:
# print(url)
# # 得到一个异步请求对象
# http_client = AsyncHTTPClient()
# # http_client.fetch 给HTTPRequest绑定回调函数的功能
# http_client.fetch(HTTPRequest(url), handle_response)
# # 提交一个任务计数就 +1
# n += 1
#
#
# # 产生循环来检测IO,启动循环后立即触发callback函数
# # 返回里面的一堆任务的结果
# ioloop.IOLoop.current().add_callback(func)
# ioloop.IOLoop.current().start()
# 关闭ioloop 还是不会执行到这一行
# ioloop.IOLoop.current().stop()