目标:
弄清楚actionrunner的原理
1 总入口
st2/st2actions/st2actions/cmd/actionrunner.py
def main():
    """Entry point of the action runner service (``cmd/actionrunner.py``).

    Runs ``_setup()`` followed by ``_run_worker()`` and always calls
    ``_teardown()`` on the way out, whatever happened before.

    :return: Whatever ``_run_worker()`` returns, or ``1`` when an unexpected
             exception was raised.
    :raises SystemExit: Re-raised with the original exit status when setup or
                        the worker requested an exit.
    """
    try:
        _setup()
        return _run_worker()
    except SystemExit as exit_code:
        # Forward the original exit status rather than the exception object:
        # ``sys.exit(<SystemExit instance>)`` is treated as a non-integer
        # status, so the interpreter prints it and exits with 1 even when the
        # requested status was 0.
        sys.exit(exit_code.code)
    except:
        # Deliberately broad: this is the top-level boundary of the process,
        # the failure is logged and converted into a non-zero return value.
        LOG.exception('(PID=%s) Worker quit due to exception.', os.getpid())
        return 1
    finally:
        _teardown()
def _setup():
    """Prepare the process before the worker components are started.

    Delegates to ``common_setup`` for the ``actionrunner`` service (database,
    MQ exchange registration and signal handlers are all switched on) and then
    calls ``_setup_sigterm_handler()``, which is defined elsewhere.
    """
    setup_options = {
        'service': 'actionrunner',
        'config': config,
        'setup_db': True,
        'register_mq_exchanges': True,
        'register_signal_handlers': True,
    }
    common_setup(**setup_options)

    _setup_sigterm_handler()
def _run_worker():
    """Start the scheduler and the worker, then block until they finish.

    Every component is started first; only afterwards is each one waited on.
    On ``KeyboardInterrupt`` / ``SystemExit`` every component is asked to shut
    down, and a failing shutdown is logged without stopping the others.

    :return: ``0`` when the components finished or were shut down cleanly,
             ``1`` when a shutdown failed or an unexpected exception occurred.
    :rtype: ``int``
    """
    LOG.info('(PID=%s) Worker started.', os.getpid())

    services = [scheduler.get_scheduler(), worker.get_worker()]

    try:
        for service in services:
            service.start()

        for service in services:
            service.wait()
    except (KeyboardInterrupt, SystemExit):
        LOG.info('(PID=%s) Worker stopped.', os.getpid())

        failed_shutdowns = 0
        for service in services:
            try:
                service.shutdown()
            except:
                # Keep going so the remaining components still get their
                # chance to shut down.
                LOG.exception('Unable to shutdown %s.', service.__class__.__name__)
                failed_shutdowns += 1

        return 1 if failed_shutdowns else 0
    except:
        LOG.exception('(PID=%s) Worker unexpectedly stopped.', os.getpid())
        return 1

    return 0
分析:
1.1)关键代码就是
components = [
scheduler.get_scheduler(),
worker.get_worker()
]
try:
for component in components:
component.start()
worker.get_worker()及其start()的分析见第2节;
scheduler.get_scheduler()及其start()的分析见第3节。
2 worker.get_worker()及其start()
st2/st2actions/st2actions/worker.py
def get_worker():
    """Create the dispatcher that consumes ``ACTIONRUNNER_QUEUES``.

    The connection is opened against the configured messaging URLs. Note that
    the ``with`` block is left before the dispatcher is handed back to the
    caller, so the connection's context-manager exit has already run by then.

    :rtype: ``ActionExecutionDispatcher``
    """
    messaging_urls = transport_utils.get_messaging_urls()
    with Connection(messaging_urls) as conn:
        dispatcher = ActionExecutionDispatcher(conn, ACTIONRUNNER_QUEUES)
    return dispatcher
分析:
2.1) 变量打印
(Pdb) p transport_utils.get_messaging_urls()
['amqp://rabbitmq:**@rabbitmq.openstack.svc.cluster.local:5672']
# Queues handed to ActionExecutionDispatcher by get_worker(): one each for
# work, cancel, pause and resume requests.
ACTIONRUNNER_QUEUES = [
    queues.ACTIONRUNNER_WORK_QUEUE,
    queues.ACTIONRUNNER_CANCEL_QUEUE,
    queues.ACTIONRUNNER_PAUSE_QUEUE,
    queues.ACTIONRUNNER_RESUME_QUEUE
]
对应具体内容如下:
# Used by the action runner service
# Each queue is built by liveaction.get_status_management_queue() with its own
# queue name and routing key (scheduled / canceling / pausing / resuming).
ACTIONRUNNER_WORK_QUEUE = liveaction.get_status_management_queue(
    'st2.actionrunner.work', routing_key='scheduled')
ACTIONRUNNER_CANCEL_QUEUE = liveaction.get_status_management_queue(
    'st2.actionrunner.cancel', routing_key='canceling')
ACTIONRUNNER_PAUSE_QUEUE = liveaction.get_status_management_queue(
    'st2.actionrunner.pause', routing_key='pausing')
ACTIONRUNNER_RESUME_QUEUE = liveaction.get_status_management_queue(
    'st2.actionrunner.resume', routing_key='resuming')
def get_status_management_queue(name, routing_key):
    """Return a queue bound to the ``st2.liveaction.status`` topic exchange.

    :param name: Name of the queue.
    :param routing_key: Routing key used to bind the queue to the exchange.
    :rtype: ``Queue``
    """
    status_exchange = Exchange('st2.liveaction.status', type='topic')
    return Queue(name, status_exchange, routing_key=routing_key)
分析:
也就是说,ActionExecutionDispatcher作为消息处理类,负责监听并处理上面4个队列中的消息。
exchange通过routing_key与各队列的绑定关系如下(格式: exchange--->routing_key--->queue):
'st2.liveaction.status'--->'scheduled'--->'st2.actionrunner.work',
'st2.liveaction.status'--->'canceling'--->'st2.actionrunner.cancel',
'st2.liveaction.status'--->'pausing'--->'st2.actionrunner.pause',
'st2.liveaction.status'--->'resuming'--->'st2.actionrunner.resume'
2.2) 分析ActionExecutionDispatcher(conn, ACTIONRUNNER_QUEUES)
class ActionExecutionDispatcher(MessageHandler):
    """Message handler whose messages are ``LiveActionDB`` objects."""

    message_type = LiveActionDB

    def __init__(self, connection, queues):
        """Initialise the handler and its per-instance state.

        :param connection: Messaging connection passed on to ``MessageHandler``.
        :param queues: Queues passed on to ``MessageHandler``.
        """
        super(ActionExecutionDispatcher, self).__init__(connection, queues)

        # Starts out empty; tracks live actions this dispatcher is running.
        self._running_liveactions = set()
        # Container that the live actions are dispatched to.
        self.container = RunnerContainer()
2.2.1)打印
(Pdb) p connection
<Connection: amqp://rabbitmq:**@rabbitmq.openstack.svc.cluster.local:5672// at 0x4863090>
(Pdb) p queues
[<unbound Queue st2.actionrunner.work -> <unbound Exchange st2.liveaction.status(topic)> -> scheduled>, <unbound Queue st2.actionrunner.cancel -> <unbound Exchange st2.liveaction.status(topic)> -> canceling>, <unbound Queue st2.actionrunner.pause -> <unbound Exchange st2.liveaction.status(topic)> -> pausing>, <unbound Queue st2.actionrunner.resume -> <unbound Exchange st2.liveaction.status(topic)> -> resuming>]
2.2.2)分析ActionExecutionDispatcher处理入口
ActionExecutionDispatcher本身是消息处理类。
class ActionExecutionDispatcher(MessageHandler):
    def process(self, liveaction):
        """Dispatch the LiveAction to the handler matching its status.

        A LiveAction that is already canceled is not executed; if its result
        is empty, it is updated with a generic cancellation message. A
        LiveAction whose status is not in ``ACTIONRUNNER_DISPATCHABLE_STATES``
        is ignored. Otherwise the status selects one of the handlers in the
        dispatch table below (scheduled, canceling, pausing, resuming).

        :param liveaction: Action execution request.
        :type liveaction: ``st2common.models.db.liveaction.LiveActionDB``

        :return: Whatever the selected handler returns; ``None`` when the
                 LiveAction is not dispatched.
        :rtype: ``dict`` according to the original docstring -- not
                verifiable from this excerpt.
        :raises StackStormDBObjectNotFoundError: When the LiveAction cannot be
                                                 found in the database.
        """

        # Already canceled: never run it, only backfill an empty result.
        if liveaction.status == action_constants.LIVEACTION_STATUS_CANCELED:
            LOG.info('%s is not executing %s (id=%s) with "%s" status.',
                     self.__class__.__name__, type(liveaction), liveaction.id, liveaction.status)
            if not liveaction.result:
                updated_liveaction = action_utils.update_liveaction_status(
                    status=liveaction.status,
                    result={'message': 'Action execution canceled by user.'},
                    liveaction_id=liveaction.id)
                executions.update_execution(updated_liveaction)
            return

        # Statuses this service does not handle are dropped silently.
        if liveaction.status not in ACTIONRUNNER_DISPATCHABLE_STATES:
            LOG.info('%s is not dispatching %s (id=%s) with "%s" status.',
                     self.__class__.__name__, type(liveaction), liveaction.id, liveaction.status)
            return

        # Reload from the database to detect a status change that happened
        # while the message was sitting in the queue.
        try:
            liveaction_db = action_utils.get_liveaction_by_id(liveaction.id)
        except StackStormDBObjectNotFoundError:
            LOG.exception('Failed to find liveaction %s in the database.', liveaction.id)
            raise

        # A mismatch is only logged; it does not change what happens next.
        if liveaction.status != liveaction_db.status:
            LOG.warning(
                'The status of liveaction %s has changed from %s to %s '
                'while in the queue waiting for processing.',
                liveaction.id,
                liveaction.status,
                liveaction_db.status
            )

        # Status -> handler method.
        dispatchers = {
            action_constants.LIVEACTION_STATUS_SCHEDULED: self._run_action,
            action_constants.LIVEACTION_STATUS_CANCELING: self._cancel_action,
            action_constants.LIVEACTION_STATUS_PAUSING: self._pause_action,
            action_constants.LIVEACTION_STATUS_RESUMING: self._resume_action
        }

        # NOTE: the handler is chosen by, and called with, the object from the
        # message (liveaction), not the freshly loaded liveaction_db.
        return dispatchers[liveaction.status](liveaction)
分析:
2.2.2.1) 打印变量
(Pdb) p liveaction
<LiveActionDB: LiveActionDB(action="email.mistral-network-check", action_is_workflow=True, callback={}, context={u'trigger_instance': {u'id': u'5e7b88d8b4a73700014aa920', u'name': None}, u'trace_context': {u'id_': u'5e7b88d8b4a73700014aa921', u'trace_tag': u'st2.IntervalTimer-8c13d9a3-12a3-4cff-88d3-66c804a47590'}, u'rule': {u'id': u'5e660433c75b50001c06b822', u'name': u'network-check'}, u'user': u'admin@example.org'}, end_timestamp=None, id=5e7b88d9b4a73700014aa922, notify=None, parameters={u'cmd': u'curl busybox:80/cmd/network/check.sh%20cn', u'email_to': [u'chao.ma@easystack.cn'], u'email_from': u'noreply@easystack.cn'}, result={}, runner_info={}, start_timestamp="2020-03-25 16:37:45.132682+00:00", status="scheduled")>
2.2.2.2)逻辑分析
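这里实际上是根据liveaction的状态分别调用不同的处理方法:
状态为canceled时不再执行(若result为空,则补写一条取消说明)并直接返回;状态不在ACTIONRUNNER_DISPATCHABLE_STATES中时直接忽略;
其余状态通过dispatchers字典分发,例如当状态为scheduled时,会调用ActionExecutionDispatcher类的_run_action方法。
重点分析_run_action方法,具体参见2.3)的分析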
2.3) 分析ActionExecutionDispatcher类的_run_action方法
class ActionExecutionDispatcher(MessageHandler):
    def _run_action(self, liveaction_db):
        """Mark the LiveAction as running, dispatch it and return the result.

        The LiveAction is stamped with the current process info and moved to
        the "running" status before it is handed to ``self.container``. When
        the dispatch raises, or produces a falsy result, the LiveAction is
        marked as failed (with the error and a traceback) and the exception
        is re-raised.

        :param liveaction_db: LiveAction to run.

        :return: The truthy result produced by ``self.container.dispatch``.
        :raises ActionRunnerException: When the dispatch returns a falsy result.
        """
        # stamp liveaction with process_info
        runner_info = system_info.get_process_info()

        # Update liveaction status to "running"
        liveaction_db = action_utils.update_liveaction_status(
            status=action_constants.LIVEACTION_STATUS_RUNNING,
            runner_info=runner_info,
            liveaction_id=liveaction_db.id)

        # Track the id for as long as the action is being run; it is dropped
        # again in the finally block below.
        self._running_liveactions.add(liveaction_db.id)

        action_execution_db = executions.update_execution(liveaction_db)

        # Launch action
        extra = {'action_execution_db': action_execution_db, 'liveaction_db': liveaction_db}
        LOG.audit('Launching action execution.', extra=extra)

        # the extra field will not be shown in non-audit logs so temporarily log at info.
        LOG.info('Dispatched {~}action_execution: %s / {~}live_action: %s with "%s" status.',
                 action_execution_db.id, liveaction_db.id, liveaction_db.status)

        extra = {'liveaction_db': liveaction_db}
        try:
            result = self.container.dispatch(liveaction_db)
            LOG.debug('Runner dispatch produced result: %s', result)
            # A falsy result is treated as a failure and goes through the
            # same handling as an exception raised by the dispatch.
            if not result:
                raise ActionRunnerException('Failed to execute action.')
        except:
            _, ex, tb = sys.exc_info()
            extra['error'] = str(ex)
            LOG.info('Action "%s" failed: %s' % (liveaction_db.action, str(ex)), extra=extra)

            # Persist the failure (error text plus up to 20 traceback
            # entries) before re-raising the original exception.
            liveaction_db = action_utils.update_liveaction_status(
                status=action_constants.LIVEACTION_STATUS_FAILED,
                liveaction_id=liveaction_db.id,
                result={'error': str(ex), 'traceback': ''.join(traceback.format_tb(tb, 20))})
            executions.update_execution(liveaction_db)
            raise
        finally:
            # In the case of worker shutdown, the items are removed from _running_liveactions.
            # As the subprocesses for action executions are terminated, this finally block
            # will be executed. Set remove will result in KeyError if item no longer exists.
            # Use set discard to not raise the KeyError.
            self._running_liveactions.discard(liveaction_db.id)

        return result
分析:
2.3.1) 打印变量
(Pdb) p liveaction_db
<LiveActionDB: LiveActionDB(action="email.mistral-network-check", action_is_workflow=True, callback={}, context={u'trigger_instance': {u'id': u'5e7b88d8b