[SaltStack] salt-minion启动流程

SaltStack源码阅读

前面理了下salt-master的启动流程, 这次来看看salt-minion的启动流程.

启动salt-minion方法:

/etc/init.d/salt-minion start
    

看看/etc/init.d/salt-minion的逻辑:

$ cat /etc/init.d/salt-minion

# Location of the salt-minion launcher, the interpreter used to run it in the
# Debian branch of start(), and any extra command-line arguments.
SALTMINION=/usr/bin/salt-minion
PYTHON=/usr/bin/python
MINION_ARGS=""
    
# start: launch salt-minion in daemon mode (-d), picking the launch helper per
# distribution. SUSE_RELEASE, DEBIAN_VERSION, LOCKFILE, SERVICE and PROCESS are
# not assigned in this excerpt -- presumably set earlier in the init script.
# Returns $RETVAL.
start() {
    echo -n $"Starting salt-minion daemon: "
    if [ -f $SUSE_RELEASE ]; then
        # SUSE branch: uses the startproc / rc_status helpers (presumably
        # provided by the distribution's rc function library -- confirm).
        startproc -f -p /var/run/$SERVICE.pid $SALTMINION -d $MINION_ARGS
        rc_status -v
    elif [ -e $DEBIAN_VERSION ]; then
        # Debian branch: an existing lock file is reported as "already
        # started"; otherwise the minion is run through $PYTHON with its
        # output discarded.
        if [ -f $LOCKFILE ]; then
            echo -n "already started, lock file found"
            RETVAL=1
        elif $PYTHON $SALTMINION -d $MINION_ARGS >& /dev/null; then
            echo -n "OK"
            RETVAL=0
        fi
    else
        # Fallback branch: skip the launch when pidofproc prints a pid for
        # the pid file, otherwise start through the daemon helper.
        if [[ ! -z "$(pidofproc -p /var/run/$SERVICE.pid $PROCESS)" ]]; then
            RETVAL=$?
            echo -n "already running"
        else
            daemon --check $SERVICE $SALTMINION -d $MINION_ARGS
        fi
    fi
    # NOTE(review): this assignment runs on every path, so the RETVAL values
    # set inside the branches above are replaced by the exit status of the
    # whole if-statement -- confirm this is intended.
    RETVAL=$?
    echo
    return $RETVAL
}       

继续看看/usr/bin/salt-minion:

$ cat /usr/bin/salt-minion

#!/usr/bin/python
'''
Entry-point script that launches the salt minion daemon
'''

from multiprocessing import freeze_support

from salt.scripts import salt_minion


if __name__ == '__main__':
    # Frozen (bundled) executables need multiprocessing's bootstrap hook to
    # run first; on a regular interpreter run the call does nothing.
    freeze_support()
    salt_minion()

这里调用了salt_minion()函数, 它定义在salt/scripts.py里:

$ cat scripts.py

def salt_minion():
    '''
    Start the salt minion.

    On Windows, or when ``--disable-keepalive`` is present on the command
    line, the minion runs directly in this process. Otherwise the minion is
    run in a child process (``minion_process``) and started again, after the
    delay the child reports through a queue, for as long as the child keeps
    reporting a non-zero delay or exits abnormally without reporting one.
    '''
    import salt.cli.daemons
    import multiprocessing
    if '' in sys.path:
        sys.path.remove('')

    if salt.utils.is_windows():
        minion = salt.cli.daemons.Minion()
        minion.start()
        return

    if '--disable-keepalive' in sys.argv:
        sys.argv.remove('--disable-keepalive')
        minion = salt.cli.daemons.Minion()
        minion.start()
        return

    # keep one minion subprocess running
    while True:
        try:
            queue = multiprocessing.Queue()
        except Exception:
            # This breaks in containers: fall back to running the minion
            # in-process without the keepalive wrapper
            minion = salt.cli.daemons.Minion()
            minion.start()
            return
        process = multiprocessing.Process(target=minion_process, args=(queue,))
        process.start()
        try:
            process.join()
            try:
                restart_delay = queue.get(block=False)
            except Exception:
                # The child exited without reporting a delay
                if process.exitcode == 0:
                    # Minion process ended naturally, Ctrl+C or --version
                    break
                restart_delay = 60
            if restart_delay == 0:
                # Minion process ended naturally, Ctrl+C, --version, etc.
                break
            # delay restart to reduce flooding and allow network resources to close
            time.sleep(restart_delay)
        except KeyboardInterrupt:
            break
        # need to reset logging because new minion objects
        # cause extra log handlers to accumulate
        rlogger = logging.getLogger()
        # Iterate over a copy: removeHandler() mutates rlogger.handlers, and
        # removing from the list being iterated skips every other handler.
        for handler in list(rlogger.handlers):
            rlogger.removeHandler(handler)
        logging.basicConfig()

这里启动minion使用了multiprocessing.Process方法, target是minion_process函数, 主要流程是:

def minion_process(queue):
    '''
    Run one minion inside the child process spawned by salt_minion.

    When finished, exactly one number is put on ``queue``: ``0`` when no
    restart is wanted, otherwise a random number of seconds the parent should
    wait before starting the next minion process.
    '''
    import salt.cli.daemons

    def suicide_when_without_parent(parent_pid):
        '''
        Poll every five seconds and terminate this process once the parent
        can no longer be signalled.

        NOTE: a different process that re-uses the parent's PID would be
        mistaken for the parent.
        '''
        while True:
            time.sleep(5)
            try:
                # signal 0 only probes whether the pid exists (Unix only)
                os.kill(parent_pid, 0)
            except OSError:
                # sys.exit would merely raise inside this thread, so exit
                # the whole process the hard way
                os._exit(999)

    if not salt.utils.is_windows():
        threading.Thread(
            target=suicide_when_without_parent,
            args=(os.getppid(),),
        ).start()

    minion = None
    needs_restart = False
    try:
        minion = salt.cli.daemons.Minion()
        minion.start()
    except (Exception, SaltClientError, SaltReqTimeoutError, SaltSystemExit):
        log.error('Minion failed to start: ', exc_info=True)
        needs_restart = True
    except SystemExit:
        # a plain exit request is not a failure; no restart
        pass

    if not needs_restart:
        queue.put(0)
        return

    log.warn('** Restarting minion **')
    max_delay = 60
    if minion is not None and hasattr(minion, 'config'):
        max_delay = minion.config.get('random_reauth_delay', 60)
    chosen_delay = randint(1, max_delay)
    log.info('Sleeping random_reauth_delay of {0} seconds'.format(chosen_delay))
    # the parent does the actual waiting, after this process has gone away
    queue.put(chosen_delay)

这里调用了salt.cli.daemons.Minion的start方法, 目标类文件在: ~/salt/cli/daemons.py

class Minion(parsers.MinionOptionParser):
    '''
    Create a minion server

    Command-line front end for the minion: ``prepare`` parses the arguments,
    verifies the runtime environment and builds the object stored in
    ``self.minion``.
    '''
    
    def prepare(self):
        '''
        Run the preparation sequence required to start a salt minion.

        Parses the command line, optionally verifies directories and the log
        file, sets up file logging, daemonizes if required, writes the pid
        file and finally creates ``self.minion``. Exits the process with the
        error's ``errno`` when the environment checks raise ``OSError``.

        If sub-classed, don't **ever** forget to run:

        super(YourSubClass, self).prepare()
        '''
        self.parse_args()

        try:
            if self.config['verify_env']:
                # Work out the include directory that has to be verified
                confd = self.config.get('default_include')
                if confd:
                    # If 'default_include' is specified in config, then use it
                    if '*' in confd:
                        # Value is of the form "minion.d/*.conf"
                        confd = os.path.dirname(confd)
                    if not os.path.isabs(confd):
                        # If configured 'default_include' is not an absolute
                        # path, consider it relative to folder of 'conf_file'
                        # (/etc/salt by default)
                        confd = os.path.join(
                            os.path.dirname(self.config['conf_file']), confd
                        )
                else:
                    # No include configured: use "minion.d" next to conf_file
                    confd = os.path.join(
                        os.path.dirname(self.config['conf_file']), 'minion.d'
                    )
                # Directories handed to verify_env below
                v_dirs = [
                        self.config['pki_dir'],
                        self.config['cachedir'],
                        self.config['sock_dir'],
                        self.config['extension_modules'],
                        confd,
                    ]
                if self.config.get('transport') == 'raet':
                    # The raet transport adds key-state and cache directories
                    v_dirs.append(os.path.join(self.config['pki_dir'], 'accepted'))
                    v_dirs.append(os.path.join(self.config['pki_dir'], 'pending'))
                    v_dirs.append(os.path.join(self.config['pki_dir'], 'rejected'))
                    v_dirs.append(os.path.join(self.config['cachedir'], 'raet'))
                verify_env(
                    v_dirs,
                    self.config['user'],
                    permissive=self.config['permissive_pki_access'],
                    pki_dir=self.config['pki_dir'],
                )
                logfile = self.config['log_file']
                if logfile is not None and not logfile.startswith(('tcp://',
                                                            'udp://',
                                                            'file://')):
                    # Logfile is not using Syslog, verify
                    # The umask is tightened only around verify_files and
                    # restored right after it
                    current_umask = os.umask(0o027)
                    verify_files([logfile], self.config['user'])
                    os.umask(current_umask)
        except OSError as err:
            logger.exception('Failed to prepare salt environment')
            sys.exit(err.errno)

        self.setup_logfile_logger()
        logger.info(
            'Setting up the Salt Minion "{0}"'.format(
                self.config['id']
            )
        )
        migrations.migrate_paths(self.config)
        # TODO: AIO core is separate from transport
        if self.config['transport'].lower() in ('zeromq', 'tcp'):
            # Late import so logging works correctly
            import salt.minion
            # If the minion key has not been accepted, then Salt enters a loop
            # waiting for it, if we daemonize later then the minion could halt
            # the boot process waiting for a key to be accepted on the master.
            # This is the latest safe place to daemonize
            self.daemonize_if_required()
            self.set_pidfile()
            # A list of masters selects MultiMinion unless master_type is
            # 'failover'; every other case uses a plain Minion
            if isinstance(self.config.get('master'), list):
                if self.config.get('master_type') == 'failover':
                    self.minion = salt.minion.Minion(self.config)
                else:
                    self.minion = salt.minion.MultiMinion(self.config)
            else:
                self.minion = salt.minion.Minion(self.config)
        else:
            # Any other transport value is served by the ioflo based minion
            import salt.daemons.flo
            self.daemonize_if_required()
            self.set_pidfile()
            self.minion = salt.daemons.flo.IofloMinion(self.config)

def start(self):
    '''
    Prepare the minion and hand control to it.

    Runs ``self.prepare()`` and, when ``check_user`` accepts the configured
    user, calls ``self.minion.tune_in()``. ``self.shutdown()`` is always
    called on the way out.

    If sub-classed, don't **ever** forget to run:

        super(YourSubClass, self).start()

    NOTE: Run any required code before calling `super()`.
    '''
    try:
        self.prepare()
        if check_user(self.config['user']):
            logger.info('The salt minion is starting up')
            # hand control to the minion object built by prepare()
            self.minion.tune_in()
    except KeyboardInterrupt:
        logger.warn('Stopping the Salt Minion')
        logger.warn('Exiting on Ctrl-c')
    except SaltSystemExit as exc:
        logger.warn('Stopping the Salt Minion')
        logger.error(str(exc))
    finally:
        self.shutdown()

在这里Minion先通过prepare方法解析参数、校验运行环境并创建self.minion对象, 然后调用self.minion.tune_in()去连接master、订阅master的publisher, 从而和master建立通信; tune_in的实现在: ~/salt/minion.py

class MultiMinion(MinionBase):
    '''
    Create a multi minion interface, this creates as many minions as are
    defined in the master option and binds each minion object to a respective
    master.

    Article note: this is the class used to start the minion when the minion
    configuration lists several masters.
    '''

    # Multi Master Tune In
    def tune_in(self):
        '''
        Bind to the masters

        This loop will attempt to create connections to masters it hasn't connected
        to yet, but once the initial connection is made it is up to ZMQ to do the
        reconnect (don't know of an API to get the state here in salt)
        '''
        # Fire off all the minion coroutines
        self.minions = self._spawn_minions()

        # serve forever! (start() on the io_loop is the last call made here)
        self.io_loop.start()
    

这里tune_in()是关键的步骤, minion链接master, 维护minion Pub/Sub socket通信等等.

先看看tune_in()方法:

# Multi Master Tune In
def tune_in(self):
    '''
    Bind to the masters

    This loop will attempt to create connections to masters it hasn't connected
    to yet, but once the initial connection is made it is up to ZMQ to do the
    reconnect (don't know of an API to get the state here in salt)
    '''
    # Fire off all the minion coroutines
    # NOTE(review): the _spawn_minions shown in this file has no return
    # statement, so self.minions ends up as None -- confirm that is intended.
    self.minions = self._spawn_minions()

    # serve forever!
    self.io_loop.start()  # start the io_loop that drives the asynchronous events
    
def _spawn_minions(self):
    '''
    Schedule one sign-in coroutine per distinct configured master.

    Exits the process when ``self.opts['master']`` is not a list. Each
    coroutine gets its own deep copy of the options, narrowed down to a
    single master.
    '''
    masters = self.opts['master']
    if not isinstance(masters, list):
        log.error(
            'Attempting to start a multimaster system with one master')
        sys.exit(salt.defaults.exitcodes.EX_GENERIC)

    # set() drops duplicate master entries
    for single_master in set(masters):
        master_opts = copy.deepcopy(self.opts)
        master_opts['master'] = single_master
        master_opts['multimaster'] = True
        master_opts['auth_timeout'] = self.MINION_CONNECT_TIMEOUT
        # _connect_minion is run by the io_loop with this master's options
        self.io_loop.spawn_callback(self._connect_minion, master_opts)


def _connect_minion(self, opts):
    '''
    Create a minion, and asynchronously connect it to a master

    Generator-style coroutine: keeps retrying until a ``Minion`` built from
    ``opts`` has connected to its master and has been tuned in.

    :param opts: option mapping for one master; ``opts['master']`` and
        ``opts['acceptance_wait_time']`` are read here.
    '''
    auth_wait = opts['acceptance_wait_time']
    while True:
        try:
            minion = Minion(opts,
                            self.MINION_CONNECT_TIMEOUT,
                            False,
                            io_loop=self.io_loop,
                            loaded_base_name='salt.loader.{0}'.format(opts['master']),
                            )
            yield minion.connect_master()
            minion.tune_in(start=False)
            break
        except SaltClientError:
            log.error('Error while bringing up minion for multi-master. Is master at {0} responding?'.format(opts['master']))
            # grow the wait step by step until it reaches max_auth_wait
            if auth_wait < self.max_auth_wait:
                auth_wait += self.auth_wait
            yield tornado.gen.sleep(auth_wait)  # TODO: log?
        except Exception:
            log.critical('Unexpected error while connecting to {0}'.format(opts['master']), exc_info=True)
            # Yield before retrying: without a pause a persistent failure
            # would spin in this loop and never give control back to the
            # io_loop.
            yield tornado.gen.sleep(auth_wait)

到这里大家会发现tune_in进入循环了... 没错! minion server进程会无限循环下去, 维护minion Pub/Sub socket, 保持和salt master的链接, 监听event事件完成Job工作等等.

接下来看看connect_master()方法:

def connect_master(self):
    '''
    Return a future which will complete when you are connected to a master

    NOTE(review): as shown here the function is a plain generator; the
    "future" wording presumably relies on a coroutine decorator that is not
    part of this excerpt -- confirm.
    '''
    # eval_master yields the chosen master together with its publish channel
    master, self.pub_channel = yield self.eval_master(self.opts, self.timeout, self.safe)

    # initialisation that has to wait until the master is known
    yield self._post_master_init(master)

eval_master方法计算并返回master地址以及生成pub_channel;

_post_master_init方法是在链接完成后进行初始化;

在eval_master里, 如果master_type是func, 则会load指定的module, 并调用其中的函数, 把返回值作为master地址; 如果master_type是failover, 则会获取master address list. 之后无论哪种类型, 都会调用salt.transport.client.AsyncPubChannel.factory()生成pub_channel, 再调用connect()方法链接master主机, 具体逻辑是下面这样的:

def eval_master(self,
                opts,
                timeout=60,
                safe=True,
                failed=False):
    '''
    Work out which master to use and open a publish channel to it.

    Generator-style coroutine. ``opts['master']`` is first normalised
    according to ``opts['master_type']`` ('str', 'func' or 'failover'; any
    other value exits the process). Then either every master of a list is
    tried in turn until one accepts the connection, or the single configured
    master is connected to directly.

    :param opts: minion option mapping; modified in place (``master``,
        ``master_list`` and whatever ``resolve_dns`` returns).
    :param timeout: forwarded to ``AsyncPubChannel.factory``.
    :param safe: forwarded to ``AsyncPubChannel.factory``.
    :param failed: consulted in the failover branch; when true the master
        list is rebuilt from ``opts['master_list']`` without the current
        master.
    :return: through ``tornado.gen.Return``, the tuple
        ``(opts['master'], pub_channel)``.
    '''
    # check if master_type was altered from its default
    if opts['master_type'] != 'str':
        # check for a valid keyword
        if opts['master_type'] == 'func':
            # split module and function and try loading the module
            mod, fun = opts['master'].split('.')
            try:
                master_mod = salt.loader.raw_mod(opts, mod, fun)
                if not master_mod:
                    raise TypeError
                # we take whatever the module returns as master address
                opts['master'] = master_mod[mod + '.' + fun]()
            except TypeError:
                msg = ('Failed to evaluate master address from '
                       'module \'{0}\''.format(opts['master']))
                log.error(msg)
                sys.exit(salt.defaults.exitcodes.EX_GENERIC)
            log.info('Evaluated master from module: {0}'.format(master_mod))

        # if failover is set, master has to be of type list
        elif opts['master_type'] == 'failover':
            if isinstance(opts['master'], list):
                log.info('Got list of available master addresses:'
                         ' {0}'.format(opts['master']))
                if opts['master_shuffle']:
                    # If master is a list of addresses, shuffle them before
                    # trying to connect to distribute the minions over all
                    # available masters.
                    shuffle(opts['master'])
            # if opts['master'] is a str and we have never created opts['master_list']
            elif isinstance(opts['master'], str) and ('master_list' not in opts):
                # We have a string, but a list was what was intended. Convert.
                # See issue 23611 for details
                # Wrap the string: list('host') would split it into
                # single characters.
                opts['master'] = [opts['master']]
            elif opts['__role'] == 'syndic':
                log.info('Syndic setting master_syndic to \'{0}\''.format(opts['master']))

            # if failed=True, the minion was previously connected
            # we're probably called from the minions main-event-loop
            # because a master connection loss was detected. remove
            # the possibly failed master from the list of masters.
            elif failed:
                log.info('Removing possibly failed master {0} from list of'
                         ' masters'.format(opts['master']))
                # create new list of master with the possibly failed one removed
                opts['master'] = [x for x in opts['master_list'] if opts['master'] != x]

            else:
                msg = ('master_type set to \'failover\' but \'master\' '
                       'is not of type list but of type '
                       '{0}'.format(type(opts['master'])))
                log.error(msg)
                sys.exit(salt.defaults.exitcodes.EX_GENERIC)
        else:
            msg = ('Invalid keyword \'{0}\' for variable '
                   '\'master_type\''.format(opts['master_type']))
            log.error(msg)
            sys.exit(salt.defaults.exitcodes.EX_GENERIC)

    # if we have a list of masters, loop through them and be
    # happy with the first one that allows us to connect
    if isinstance(opts['master'], list):
        conn = False
        # iterate over a copy, because opts['master'] is overwritten with
        # the candidate currently being tried
        local_masters = copy.copy(opts['master'])

        for master in local_masters:
            opts['master'] = master
            opts.update(resolve_dns(opts))
            super(Minion, self).__init__(opts)  # TODO: only run init once?? This will run once per attempt

            # on first run, update self.opts with the whole master list
            # to enable a minion to re-use old masters if they get fixed
            if 'master_list' not in opts:
                opts['master_list'] = local_masters

            try:
                pub_channel = salt.transport.client.AsyncPubChannel.factory(opts,
                                                                            timeout=timeout,
                                                                            safe=safe,
                                                                            io_loop=self.io_loop,
                                                                            )
                yield pub_channel.connect()
                conn = True
                break
            except SaltClientError:
                # connecting to this master failed, try the next one
                msg = ('Master {0} could not be reached, trying '
                       'next master (if any)'.format(opts['master']))
                log.info(msg)
                continue

        if not conn:
            # NOTE(review): nothing is raised or returned on this path, so
            # the coroutine finishes without a (master, channel) result --
            # confirm that callers cope with that.
            self.connected = False
            msg = ('No master could be reached or all masters denied '
                   'the minions connection attempt.')
            log.error(msg)
        else:
            self.connected = True
            raise tornado.gen.Return((opts['master'], pub_channel))

    # single master sign in
    else:
        # resolve_dns supplies the resolved address data for the master
        opts.update(resolve_dns(opts))
        # NOTE(review): this branch passes self.opts while the list branch
        # passes opts -- confirm the two are meant to differ.
        pub_channel = salt.transport.client.AsyncPubChannel.factory(self.opts,
                                                                    timeout=timeout,
                                                                    safe=safe,
                                                                    io_loop=self.io_loop,
                                                                    )
        yield pub_channel.connect()
        self.tok = pub_channel.auth.gen_token('salt')
        self.connected = True
        raise tornado.gen.Return((opts['master'], pub_channel))

可以看出在上面的逻辑中, 无论是multi master还是single master, 最终都会调用salt.transport.client.AsyncPubChannel类的factory()和connect()方法来完成connect master, 目标文件在: ~/salt/transport/client.py里.

class AsyncPubChannel(AsyncChannel):
    '''
    Factory for the channels that subscribe to the master's Publisher
    '''
    @classmethod
    def factory(cls, opts, **kwargs):
        '''
        Build the publish channel matching the configured transport.

        The transport name is taken from ``opts['transport']`` when that key
        exists, otherwise from ``opts['pillar']['master']['transport']`` when
        present, otherwise it is 'zeromq'. ``opts`` and ``kwargs`` are passed
        on unchanged to the selected channel class. Raises ``Exception`` for
        an unrecognised transport name.
        '''
        # Pick the transport name, falling back to ZeroMQ
        if 'transport' in opts:
            transport = opts['transport']
        elif 'transport' in opts.get('pillar', {}).get('master', {}):
            transport = opts['pillar']['master']['transport']
        else:
            transport = 'zeromq'

        # Each transport module is imported only when it is selected
        if transport == 'zeromq':
            import salt.transport.zeromq
            return salt.transport.zeromq.AsyncZeroMQPubChannel(opts, **kwargs)

        if transport == 'raet':  # TODO:
            import salt.transport.raet
            return salt.transport.raet.AsyncRAETPubChannel(opts, **kwargs)

        if transport == 'tcp':
            if not cls._resolver:
                # TODO: add opt to specify number of resolver threads
                AsyncChannel._init_resolver()
            import salt.transport.tcp
            return salt.transport.tcp.AsyncTCPPubChannel(opts, **kwargs)

        if transport == 'local':  # TODO:
            import salt.transport.local
            return salt.transport.local.AsyncLocalPubChannel(opts, **kwargs)

        raise Exception('Channels are only defined for ZeroMQ and raet')
            
        

现在基本都是基于zeromq通信的, salt.transport.zeromq.AsyncZeroMQPubChannel(opts, **kwargs)方法处理pub_channel, 目标类文件在: ~/salt/transport/zeromq.py

class AsyncZeroMQPubChannel(salt.transport.mixins.auth.AESPubClientMixin, salt.transport.client.AsyncPubChannel):
    '''
    A transport channel backed by ZeroMQ for a Salt Publisher to use to
    publish commands to connected minions

    The constructor creates a ``zmq.SUB`` socket and configures it from the
    options; ``connect`` authenticates and connects the socket.
    '''
    def __init__(self,
                 opts,
                 **kwargs):
        '''
        Create and configure the SUB socket.

        :param opts: option mapping; the keys read here are ``id``,
            ``zmq_filtering``, ``tcp_keepalive*``, ``recon_default``,
            ``recon_max``, ``recon_randomize``, ``ipv6`` and ``zmq_monitor``.
        :param kwargs: only ``io_loop`` is looked at here; a new
            ``tornado.ioloop.IOLoop`` is created when it is absent.
        '''
        self.opts = opts
        self.ttype = 'zeromq'

        if 'io_loop' in kwargs:
            self.io_loop = kwargs['io_loop']
        else:
            self.io_loop = tornado.ioloop.IOLoop()

        # NOTE(review): the id is passed to sha1 as-is; hashlib requires
        # bytes on Python 3 -- confirm the target interpreter.
        self.hexid = hashlib.sha1(self.opts['id']).hexdigest()

        # asynchronous authentication helper, shared with connect()
        self.auth = salt.crypt.AsyncAuth(self.opts, io_loop=self.io_loop)

        self.serial = salt.payload.Serial(self.opts)

        # subscriber end of the ZeroMQ PUB/SUB pair
        self.context = zmq.Context()
        self._socket = self.context.socket(zmq.SUB)

        if self.opts['zmq_filtering']:
            # TODO: constants file for "broadcast"
            self._socket.setsockopt(zmq.SUBSCRIBE, 'broadcast')
            self._socket.setsockopt(zmq.SUBSCRIBE, self.hexid)
        else:
            self._socket.setsockopt(zmq.SUBSCRIBE, '')

        # No further unconditional empty-prefix subscription here: it would
        # match every message and cancel the filtering chosen above.
        # the socket identity is the minion's configured id
        self._socket.setsockopt(zmq.IDENTITY, self.opts['id'])

        # TODO: cleanup all the socket opts stuff
        if hasattr(zmq, 'TCP_KEEPALIVE'):
            self._socket.setsockopt(
                zmq.TCP_KEEPALIVE, self.opts['tcp_keepalive']
            )
            self._socket.setsockopt(
                zmq.TCP_KEEPALIVE_IDLE, self.opts['tcp_keepalive_idle']
            )
            self._socket.setsockopt(
                zmq.TCP_KEEPALIVE_CNT, self.opts['tcp_keepalive_cnt']
            )
            self._socket.setsockopt(
                zmq.TCP_KEEPALIVE_INTVL, self.opts['tcp_keepalive_intvl']
            )

        recon_delay = self.opts['recon_default']

        if self.opts['recon_randomize']:
            recon_delay = randint(self.opts['recon_default'],
                                  self.opts['recon_default'] + self.opts['recon_max']
                          )

            log.debug("Generated random reconnect delay between '{0}ms' and '{1}ms' ({2})".format(
                self.opts['recon_default'],
                self.opts['recon_default'] + self.opts['recon_max'],
                recon_delay)
            )

        log.debug("Setting zmq_reconnect_ivl to '{0}ms'".format(recon_delay))
        self._socket.setsockopt(zmq.RECONNECT_IVL, recon_delay)

        if hasattr(zmq, 'RECONNECT_IVL_MAX'):
            # NOTE(review): the value logged (recon_default + recon_max)
            # differs from the value set below (recon_max) -- confirm which
            # one is intended.
            log.debug("Setting zmq_reconnect_ivl_max to '{0}ms'".format(
                self.opts['recon_default'] + self.opts['recon_max'])
            )

            self._socket.setsockopt(
                zmq.RECONNECT_IVL_MAX, self.opts['recon_max']
            )

        if self.opts['ipv6'] is True and hasattr(zmq, 'IPV4ONLY'):
            # IPv6 sockets work for both IPv6 and IPv4 addresses
            self._socket.setsockopt(zmq.IPV4ONLY, 0)

        if HAS_ZMQ_MONITOR and self.opts['zmq_monitor']:
            self._monitor = ZeroMQSocketMonitor(self._socket)
            self._monitor.start_io_loop(self.io_loop)

    # TODO: this is the time to see if we are connected, maybe use the req channel to guess?
    @tornado.gen.coroutine
    def connect(self):
        '''
        Authenticate if that has not happened yet, then connect the socket.

        ``self.master_pub`` is not defined in this excerpt -- presumably
        provided elsewhere in the class.
        '''
        if not self.auth.authenticated:
            yield self.auth.authenticate()
        self.publish_port = self.auth.creds['publish_port']
        self._socket.connect(self.master_pub)

看到这里也就明白了, minion在启动后最终使用了ZMQ库的Pub/Sub模型, connect方法链接master机器.

至此salt-minion算是启动起来了 -_-


先去喝杯水, 我想静静 -:)

接下来需要狠狠脑补下zeroMQ等网络通信机制.

From reno

2015-07-10 14:59

转载于:https://www.cnblogs.com/renolei/p/4635837.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值