# Mainly look at the Demo class at the bottom of the file.
# For database access see connect_db; don't forget to close the connection.
# TODO: tidy up this small framework and write more demos;
# for now this mostly marks out the pitfalls.
import asyncio
from threading import Thread
import logging.config
import aiomysql
import aiohttp
class Request:
    """Description of a single HTTP request for the spider.

    url             -- target URL.
    callback        -- async-generator parser invoked with the Response;
                       the spider substitutes its own ``parse`` when None.
    session_setting -- kwargs for ``aiohttp.ClientSession`` (or {}).
    meta            -- arbitrary user data carried along (kwarg ``meta``).
    method          -- HTTP verb name, default 'get' (kwarg ``method``).
    method_setting  -- all remaining kwargs, forwarded to the session method.
    """

    def __init__(self, url, *, callback=None, session_setting=None, **kwargs):
        self.url = url
        self.callback = callback
        self.session_setting = session_setting or {}
        # Pop the recognised keywords; whatever is left belongs to the
        # actual request call (params, headers, timeout, ...).
        self.meta = kwargs.pop('meta', {})
        self.method = kwargs.pop('method', 'get')
        self.method_setting = kwargs

    def __repr__(self):
        return '%s(url=%r, method=%r)' % (
            type(self).__name__, self.url, self.method
        )
class Response:
    """Proxy that pairs an HTTP response with the Request that produced it.

    Attribute access for anything other than ``request``/``resp`` is
    delegated to the wrapped response object, so callers can use this
    object exactly like the underlying aiohttp response.
    """

    def __init__(self, resp, request):
        # Bypass our own __setattr__ so these land in this instance's dict.
        object.__setattr__(self, 'request', request)
        object.__setattr__(self, 'resp', resp)

    def __getattr__(self, item):
        # Only reached when normal lookup fails; guard the proxy's own
        # attributes, otherwise ``self.resp`` would recurse forever after
        # one of them has been deleted.
        if item in ('request', 'resp'):
            raise AttributeError(item)
        return getattr(self.resp, item)

    def __setattr__(self, key, value):
        # (removed leftover debug print)
        if key == "request":
            self.__dict__[key] = value
        else:
            self.resp.__dict__[key] = value

    def __delattr__(self, item):
        if item == "request":
            # ``delattr(self, 'request')`` would re-enter __delattr__ forever;
            # go through object to actually remove the attribute.
            object.__delattr__(self, item)
        else:
            return delattr(self.resp, item)
# dictConfig-style logging setup: a single console handler at DEBUG level.
config = {
    'version': 1,
    'formatters': {
        'simple': {
            'format': '%(asctime)s - %(thread)d - %(filename)s %(lineno)d - %(levelname)s - %(message)s',
        },
        # other formatters
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'level': 'DEBUG',
            'formatter': 'simple'
        },
        # 'file': {
        #     'class': 'logging.FileHandler',
        #     'filename': 'logging.log',
        #     'level': 'DEBUG',
        #     'formatter': 'simple'
        # },
        # other handlers
    },
    'loggers': {
        'StreamLogger': {
            'handlers': ['console'],
            'level': 'DEBUG',
        },
        # 'FileLogger': {
        #     # has both the console handler and the file handler
        #     'handlers': ['console', 'file'],
        #     'level': 'DEBUG',
        # },
        # other loggers
    }
}
logging.config.dictConfig(config)
# Module-level logger shared by the Spider class below.
stream_logger = logging.getLogger("StreamLogger")
# FileLogger = logging.getLogger("FileLogger")
# (log output omitted)
class Spider:
    """Minimal asynchronous crawler.

    A private event loop runs forever on a daemon thread; work is handed
    to it with ``asyncio.run_coroutine_threadsafe``.  Parser callbacks are
    async generators that yield either ``Request`` objects (follow-up
    requests) or ``dict`` objects (data items routed to ``data_handle``).
    """

    def __init__(self, *args):
        # args: start URLs (str) and/or ready-made Request objects.
        assert args, '请传入起始url'
        # Start requests: bare URL strings are wrapped into Request objects.
        self.start_requests = list(
            map(lambda x: Request(x) if isinstance(x, str) else x, args)
        )
        # logging
        self.log = stream_logger
        # Private event loop.
        self.new_loop = asyncio.new_event_loop()
        # Thread that drives the event loop.
        self.thread = Thread(target=self.start_loop, )
        # concurrent.futures.Future results of the submitted coroutines.
        self.results = []
        # Guard futures that keep each session open until all follow-up
        # work has finished (sessions are reused by child requests).
        self.sessions = []
        # Total number of scheduled tasks.
        self._count = len(args)
        # Number of completed tasks.
        self._finish_count = 0
        # Database connection pool (None until connect_db succeeds).
        self.db = None
        # Futures of the DB open/close coroutines.
        self.db_results = []

    @property
    def start_urls(self):
        """URLs of the start requests."""
        return [i.url for i in self.start_requests]

    async def connect_db(self):
        """Create the aiomysql pool; on failure log and leave self.db None."""
        try:
            self.log.info("正在连接数据库")
            self.db = await aiomysql.create_pool(
                host='127.0.0.1',
                port=3306,
                user='root',
                password='1234',
                db='mysql',
                loop=None,
                autocommit=False
            )
            self.log.info("数据库链接成功")
        except Exception as e:
            self.db = None
            # Lazy %-formatting: passing the exception without a placeholder
            # would break the log record formatting.
            self.log.error("数据库连接失败: %s", e)

    def close_db(self):
        """Schedule DB shutdown; the loop is stopped once it completes."""
        result = asyncio.run_coroutine_threadsafe(
            self.wait_closed(),
            self.new_loop
        )
        self.db_results.append(result)
        result.add_done_callback(self.stop_loop)

    async def wait_closed(self):
        """Close the pool and wait until every connection is released."""
        if self.db:
            try:
                self.log.info("关闭数据库链接")
                self.db.close()
                await self.db.wait_closed()
                self.log.info("数据库链接关闭成功")
            except Exception as e:
                self.log.error("数据库关闭失败: %s", e)

    def start_thread(self):
        """Start the daemon thread that runs the event loop."""
        self.log.info("开始运行")
        # Thread.setDaemon() is deprecated; assign the attribute instead.
        self.thread.daemon = True
        self.thread.start()

    def start_loop(self):
        """Thread target: bind the loop to this thread and run it forever."""
        asyncio.set_event_loop(self.new_loop)
        self.new_loop.run_forever()

    def run(self):
        """Entry point: start the loop thread, connect the database,
        schedule every start request, then block until everything ends."""
        self.start_thread()
        db_result = asyncio.run_coroutine_threadsafe(
            self.connect_db(),
            self.new_loop
        )
        self.db_results.append(db_result)
        for request in self.start_requests:
            self.add_loop(request)
        self.join()

    def add_loop(self, request, sess=None, count=True, event="request"):
        """Submit one task to the event loop.

        request -- a Request (event == "request") or a data dict;
        sess    -- aiohttp session shared with follow-up requests;
        count   -- True only for start requests (already in _count);
        event   -- "request" to fetch, anything else runs data_handle.
        """
        if not count:
            self._count += 1
        if event == "request":
            request.callback = request.callback or self.parse
            result = asyncio.run_coroutine_threadsafe(
                self.fetch(request, sess, first=count),
                self.new_loop
            )
        else:
            result = asyncio.run_coroutine_threadsafe(
                self.data_handle(request),
                self.new_loop
            )
        # Start requests complete through stop_first, the rest through stop.
        if count:
            result.add_done_callback(self.stop_first)
        else:
            result.add_done_callback(self.stop)
        self.results.append(result)

    def stop_first(self, f):
        """Done-callback of a start request; closes the DB (and thereby
        stops the loop) once every counted task has finished."""
        self._finish_count += 1
        self.log.info("已完成%d任务,总共%d任务,占据%.2f%%" %
                      (
                          self._finish_count,
                          self._count,
                          self._finish_count / self._count * 100,
                      )
                      )
        if self._count == self._finish_count:
            # close_db() stops the event loop via its done-callback.
            self.close_db()

    def stop(self, f):
        """Done-callback of a follow-up task; logs any exception and may
        release the session-guard futures."""
        self._finish_count += 1
        self.log.info("已完成%d任务,总共%d任务,占据%.2f%%" %
                      (
                          self._finish_count,
                          self._count,
                          self._finish_count / self._count * 100,
                      )
                      )
        if f.exception():
            self.log.error(f.exception())
        self.results.remove(f)
        self.set_session_result()

    def set_session_result(self, ):
        """Resolve the session-guard futures once only the start-request
        futures remain in self.results, letting fetch() close its session."""
        if len(self.results) == len(self.start_urls):
            for f in self.sessions:
                if not f.done():
                    f.set_result(None)

    def stop_loop(self, f):
        """Done-callback of wait_closed(): stop the private event loop."""
        self.new_loop.stop()

    def join(self):
        """Block until the loop thread exits, then clean up."""
        self.thread.join()
        self.close_loop()
        for f in self.results:
            # NOTE(review): relies on a private attribute of
            # concurrent.futures.Future to drop our callback references.
            f._done_callbacks.remove(self.stop_first)
        self.log.info("事件循环结束")

    def close_loop(self):
        """Close the (already stopped) event loop."""
        self.new_loop.close()

    def add_event(self, prev_request, prev_event, sess=None):
        """Schedule the buffered previous event, if any (see fetch())."""
        if prev_request:
            if prev_event == 'request':
                self.add_loop(prev_request, count=False, sess=sess, event="request")
            else:
                self.add_loop(prev_request, count=False, event="data_handle")

    async def fetch(self, request, sess=None, first=True):
        """Perform one HTTP request and drive its parser callback.

        Yielded events are scheduled one step late: the previous yield is
        only submitted when the next one arrives (plus one flush after the
        loop), so the shared session cannot be closed while a just-yielded
        child request has not been registered yet.
        """
        if first:
            sess = aiohttp.ClientSession(**request.session_setting)
        method_func = getattr(sess, request.method.lower())
        async with method_func(request.url, **request.method_setting) as resp:
            response = Response(resp, request)
            prev_request = prev_event = None
            async for parse_result in request.callback(response):
                if isinstance(parse_result, Request):
                    self.add_event(prev_request, prev_event, sess=sess)
                    prev_request, prev_event = parse_result, 'request'
                elif isinstance(parse_result, dict):
                    self.add_event(prev_request, prev_event, sess=sess)
                    prev_request, prev_event = parse_result, 'data_handle'
                else:
                    self.log.error("严重错误,"
                                   "回调函数必须返回Request或者dict!!!")
                    # Raising a bare str is itself a TypeError; raise a real
                    # exception type instead.
                    raise TypeError("必须返回Request或者dict")
            else:
                # The last buffered event is never flushed by the loop above.
                self.add_event(prev_request, prev_event, sess=sess)
        if first:
            # Guard future: resolved by set_session_result() once every
            # follow-up task has finished; only then close the session.
            close_session = asyncio.Future()
            self.sessions.append(close_session)
            await close_session
            await sess.close()

    async def data_handle(self, data):
        """Hook: process one yielded data item (override in subclasses)."""
        return data

    async def parse(self, resp):
        """Default parser callback (override in subclasses)."""
        yield None
if __name__ == '__main__':
    # python 3.6.5
    # Demo spider: exercises chained callbacks, data items, and a
    # deliberate failure inside data_handle.
    class Demo(Spider):
        async def data_handle(self, data):
            print(data)
            await asyncio.sleep(3)
            # Deliberate ZeroDivisionError to exercise error logging.
            1 / 0
            return data

        async def parse(self, resp):
            yield Request('http://www.baidu.com', callback=self.parse2)
            await asyncio.sleep(3)
            yield Request('http://www.baidu.com', callback=self.parse3)
            print("shui 3 ")
            await asyncio.sleep(2)
            yield Request('http://www.baidu.com', callback=self.parse3)

        async def parse2(self, resp):
            yield Request('http://www.baidu.com', callback=self.parse4)
            await asyncio.sleep(6)
            yield Request('http://www.baidu.com', callback=self.parse3)
            await asyncio.sleep(10)
            yield Request('http://www.baidu.com', callback=self.parse3)

        async def parse3(self, resp):
            await resp.text()
            print("解析333")
            # A yielded dict is routed to data_handle.
            yield {'1': 1}

        async def parse4(self, resp):
            yield Request('http://www.baidu.com', callback=self.parse3)
            await asyncio.sleep(10)
            yield Request('http://www.baidu.com', callback=self.parse3)
            await asyncio.sleep(2)
            yield Request('http://www.baidu.com', callback=self.parse3)

    spider = Demo('http://www.baidu.com', )
    spider.run()
    print(spider.results)
    for i in spider.results:
        print(i.exception())
    print(spider.db_results)
    print(spider.sessions)