Zeus源码剖析之Zeus的启动过程
Zeus启动流程图
涉及核心类如下:
Zeus启动步骤详解
Zeus也是基于Spring的分布式Hadoop作业调度系统,我们可以从Zeus系统中的Spring配置文件中抽丝剥茧,逐步分析Zeus的启动过程。首先,我们先看一下Zeus的applicationContext.xml都有哪些信息:
<?xml version="1.0" encoding="UTF-8" ?>
<beans xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns="http://www.springframework.org/schema/beans"
xmlns:p="http://www.springframework.org/schema/p"
xmlns:tx="http://www.springframework.org/schema/tx"
xmlns:aop="http://www.springframework.org/schema/aop"
xmlns:context="http://www.springframework.org/schema/context"
xsi:schemaLocation="
http://www.springframework.org/schema/context http://www.springframework.org/schema/context/spring-context-2.5.xsd
http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-2.5.xsd
http://www.springframework.org/schema/aop http://www.springframework.org/schema/aop/spring-aop-2.0.xsd
http://www.springframework.org/schema/tx http://www.springframework.org/schema/tx/spring-tx-2.5.xsd "
default-autowire="byName">
<context:annotation-config />
<bean id="proInit" class="com.taobao.zeus.web.platform.util.ProUtil" init-method="init"/>
<bean id="clientWorker" class="com.taobao.zeus.socket.worker.ClientWorker" ></bean>
<!-- gwt bean -->
<bean id="user.rpc" class="com.taobao.zeus.web.platform.server.rpc.UserServiceImpl" >
<property name="groupManager" ref="groupManager" />
</bean>
<bean id="tree.rpc" class="com.taobao.zeus.web.platform.server.rpc.TreeServiceImpl" />
<bean id="group.rpc" class="com.taobao.zeus.web.platform.server.rpc.GroupServiceImpl" />
<bean id="job.rpc" class="com.taobao.zeus.web.platform.server.rpc.FilterJobServiceImpl">
<property name="jobService">
<bean class="com.taobao.zeus.web.platform.server.rpc.JobServiceImpl" />
</property>
</bean>
<bean id="file.rpc" class="com.taobao.zeus.web.platform.server.rpc.FileManagerRpcImpl" />
<bean id="debug.rpc" class="com.taobao.zeus.web.platform.server.rpc.JobDebugRpcImpl" />
<bean id="profile.rpc" class="com.taobao.zeus.web.platform.server.rpc.ProfileManagerRpcImpl" />
<bean id="table.rpc" class="com.taobao.zeus.web.platform.server.rpc.TableManagerRpcImpl" />
<bean id="report.rpc" class="com.taobao.zeus.web.platform.server.rpc.ReportRpcImpl" />
<!-- Manager -->
<bean id="profileManager" class="com.taobao.zeus.store.mysql.MysqlProfileManager" />
<bean id="debugHistoryManager" class="com.taobao.zeus.store.mysql.MysqlDebugHistoryManager" />
<bean id="fileManager" class="com.taobao.zeus.store.mysql.MysqlFileManager" />
<bean id="followManagerOld" class="com.taobao.zeus.store.mysql.MysqlFollowManagerOld" />
<bean id="userManager" class="com.taobao.zeus.store.mysql.MysqlUserManager" />
<bean id="jobHistoryManager" class="com.taobao.zeus.store.mysql.MysqlJobHistoryManager" />
<bean id="permissionManager" class="com.taobao.zeus.store.mysql.MysqlPermissionManager" />
<bean id="groupManager" class="com.taobao.zeus.store.mysql.MysqlGroupManager" />
<bean id="groupManagerOld" class="com.taobao.zeus.store.mysql.MysqlGroupManagerOld" />
<bean id="tableManager" class="com.taobao.zeus.store.CliTableManager" />
<bean id="reportManager" class="com.taobao.zeus.store.mysql.MysqlReportManager" />
<bean id="zeusLogManager" class="com.taobao.zeus.store.mysql.MysqlLogManager" />
<bean id="hostGroupManager" class="com.taobao.zeus.store.mysql.MysqlHostGroupManager" />
<bean id="readOnlyGroupManager" class="com.taobao.zeus.store.mysql.ReadOnlyGroupManager" >
<property name="groupManager" ref="groupManager" />
</bean>
<bean id="readOnlyGroupManagerOld" class="com.taobao.zeus.store.mysql.ReadOnlyGroupManagerOld" >
<property name="groupManager" ref="groupManagerOld" />
</bean>
<bean id="permissionScheduleGroupManagerOld" class="com.taobao.zeus.web.PermissionGroupManagerOld" >
<property name="groupManager" ref="scheduleGroupManagerOld"></property>
</bean>
<bean id="permissionScheduleGroupManager" class="com.taobao.zeus.web.PermissionGroupManager" >
<property name="groupManager" ref="scheduleGroupManager"></property>
</bean>
<bean id="scheduleGroupManagerOld" class="com.taobao.zeus.web.ScheduleGroupManagerOld" >
<property name="groupManager" ref="groupManagerOld"></property>
</bean>
<bean id="scheduleGroupManager" class="com.taobao.zeus.web.ScheduleGroupManager" >
<property name="groupManager" ref="groupManager"></property>
</bean>
<bean id="jobValidate" class="com.taobao.zeus.store.mysql.tool.JobValidate"></bean>
<bean id="jobValidateOld" class="com.taobao.zeus.store.mysql.tool.JobValidateOld"></bean>
<bean id="distributeLocker" class="com.taobao.zeus.schedule.DistributeLocker" init-method="init" depends-on="environment">
<constructor-arg>
<value>${zeus.connect.port}</value>
</constructor-arg>
</bean>
</beans>
首先重点看一下最后一行注入的bean:distributeLocker以及其初始化方法init,看看这个方法将进行哪些初始化操作。DistributeLocker起到的作用在我看来就是Zeus集群中主节点选举的一个分布式锁,首先它会读取数据库中的zeus_lock表,这个表中记录了Zeus中主节点的IP,一旦主节点故障,心跳丢失,从节点将抢占这个锁,集群会发生主节点切换。DistributeLocker源码如下:
package com.taobao.zeus.schedule;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import com.taobao.zeus.broadcast.alarm.MailAlarm;
import com.taobao.zeus.util.MonitorUtil;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.hibernate.HibernateException;
import org.hibernate.Query;
import org.hibernate.Session;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.orm.hibernate3.HibernateCallback;
import org.springframework.orm.hibernate3.support.HibernateDaoSupport;
import com.taobao.zeus.schedule.mvc.ScheduleInfoLog;
import com.taobao.zeus.socket.master.MasterContext;
import com.taobao.zeus.socket.worker.ClientWorker;
import com.taobao.zeus.store.HostGroupManager;
import com.taobao.zeus.store.mysql.persistence.DistributeLock;
import com.taobao.zeus.util.Environment;
/**
 * Distributed-lock based master detector.
 * Every node polls the zeus_lock table once a minute: the row holder acts as
 * master and refreshes its heartbeat; other nodes in the preemption group take
 * the lock over when the heartbeat is older than five minutes.
 * @author zhoufang
 */
public class DistributeLocker extends HibernateDaoSupport{
    private static Logger log=LogManager.getLogger(DistributeLocker.class);
    // Identity of this node in zeus_lock. Defaults to a random UUID and is
    // replaced by the local IP address in the static initializer below.
    public static String host=UUID.randomUUID().toString();
    @Autowired
    private HostGroupManager hostGroupManager;
    @Autowired
    private ApplicationContext applicationContext;
    @Autowired
    private ClientWorker worker;
    private ZeusSchedule zeusSchedule;
    // Schedule server port; overridden by the constructor argument when valid.
    private int port=9887;

    static{
        try {
            host=InetAddress.getLocalHost().getHostAddress();
        } catch (UnknownHostException e) {
            // ignore: keep the random UUID as this node's identity
        }
    }

    /**
     * @param port schedule server port (injected as ${zeus.connect.port});
     *             falls back to the default 9887 when not a valid number.
     */
    public DistributeLocker(String port){
        try {
            this.port=Integer.valueOf(port);
        } catch (NumberFormatException e) {
            log.error("port must be a number", e);
        }
    }

    /**
     * Spring init-method: builds the ZeusSchedule and starts the periodic lock
     * scan (initial delay 20s, period 60s).
     */
    public void init() throws Exception{
        zeusSchedule=new ZeusSchedule(applicationContext);
        ScheduledExecutorService service=Executors.newScheduledThreadPool(3);
        service.scheduleAtFixedRate(new Runnable() {
            @Override
            public void run() {
                try {
                    update();
                } catch (Exception e) {
                    // Must not let an exception escape: scheduleAtFixedRate
                    // cancels the task on an uncaught exception.
                    log.error(e);
                }
            }
        }, 20, 60, TimeUnit.SECONDS);
    }

    /**
     * Periodic scan: reads the newest zeus_lock row for this schedule group
     * (creating one when absent), then either refreshes the heartbeat and runs
     * as master, robs a stale lock, or steps down to worker-only mode.
     */
    private void update(){
        DistributeLock lock=(DistributeLock) getHibernateTemplate().execute(new HibernateCallback() {
            @Override
            public Object doInHibernate(Session session) throws HibernateException,
                    SQLException {
                Query query=session.createQuery("from com.taobao.zeus.store.mysql.persistence.DistributeLock where subgroup=? order by id desc");
                query.setParameter(0, Environment.getScheduleGroup());
                query.setMaxResults(1);
                DistributeLock lock= (DistributeLock) query.uniqueResult();
                if(lock==null){
                    // No lock row yet: claim the lock by inserting one for this host.
                    lock=new DistributeLock();
                    lock.setHost(host);
                    lock.setServerUpdate(new Date());
                    lock.setSubgroup(Environment.getScheduleGroup());
                    session.save(lock);
                    lock=(DistributeLock) query.uniqueResult();
                }
                return lock;
            }
        });
        if(host.equals(lock.getHost())){
            // This node holds the lock: refresh the heartbeat and run as master.
            lock.setServerUpdate(new Date());
            getHibernateTemplate().update(lock);
            log.info("hold the locker and update time");
            zeusSchedule.startup(port);
        }else{
            // Another server holds the lock.
            log.info("not my locker");
            // When the heartbeat is older than five minutes the master is
            // presumed dead; a node in the preemption group robs the lock.
            long currTime = System.currentTimeMillis();
            long lockTime = lock.getServerUpdate().getTime();
            if(currTime - lockTime >1000*60*5L && isPreemptionHost()){
                String isOnlineMsg = Environment.isOnline()?"线上":"线下";
                MonitorUtil.sendMonitorEmail(isOnlineMsg+" 警告,发生master切换,时间距离:"+currTime+":"+lockTime);
                lock.setHost(host);
                lock.setServerUpdate(new Date());
                lock.setSubgroup(Environment.getScheduleGroup());
                getHibernateTemplate().update(lock);
                log.error("rob the locker and update");
                zeusSchedule.startup(port);
            }else{
                // The master looks healthy: make sure this node is not acting as master.
                zeusSchedule.shutdown();
            }
        }
        try {
            // Regardless of role, connect this node's worker to the current master.
            worker.connect(lock.getHost(),port);
        } catch (Exception e) {
            ScheduleInfoLog.error("start up worker fail", e);
        }
    }

    /**
     * Returns whether this host belongs to the preemption (master-candidate)
     * group and is therefore allowed to rob a stale lock.
     */
    public boolean isPreemptionHost(){
        List<String> preemptionhosts = hostGroupManager.getPreemptionHost();
        if (preemptionhosts.contains(host)) {
            // BUG FIX: this branch previously returned false as well, so
            // isPreemptionHost() always returned false and the master-failover
            // condition in update() could never fire.
            return true;
        }else {
            ScheduleInfoLog.info(host + " is not in master group: " + preemptionhosts.toString());
            return false;
        }
    }
}
其中DistributeLocker的init方法如下:
public void init() throws Exception{
// Build the Zeus schedule facade; it holds the Spring application context
// and later creates the MasterContext from it.
zeusSchedule=new ZeusSchedule(applicationContext);
// Create a scheduled pool of 3 threads and run update() periodically
// (initial delay 20 seconds, then every 60 seconds).
ScheduledExecutorService service=Executors.newScheduledThreadPool(3);
service.scheduleAtFixedRate(new Runnable() {
@Override
public void run() {
try {
update();
} catch (Exception e) {
// Swallow and log: an uncaught exception would cancel the periodic task.
log.error(e);
}
}
}, 20, 60, TimeUnit.SECONDS);
}
首先来看看调用的update方法执行了什么操作:
private void update(){
// Read the newest zeus_lock row for this schedule group; if none exists,
// this node claims the lock by inserting a row for itself.
DistributeLock lock=(DistributeLock) getHibernateTemplate().execute(new HibernateCallback() {
@Override
public Object doInHibernate(Session session) throws HibernateException,
SQLException {
Query query=session.createQuery("from com.taobao.zeus.store.mysql.persistence.DistributeLock where subgroup=? order by id desc");
query.setParameter(0, Environment.getScheduleGroup());
query.setMaxResults(1);
DistributeLock lock= (DistributeLock) query.uniqueResult();
if(lock==null){
// No lock row yet: insert one naming this host as the holder.
lock=new DistributeLock();
lock.setHost(host);
lock.setServerUpdate(new Date());
lock.setSubgroup(Environment.getScheduleGroup());
session.save(lock);
lock=(DistributeLock) query.uniqueResult();
}
return lock;
}
});
if(host.equals(lock.getHost())){
// This node holds the lock: refresh the heartbeat timestamp and run as master.
lock.setServerUpdate(new Date());
getHibernateTemplate().update(lock);
log.info("hold the locker and update time");
zeusSchedule.startup(port);
}else{// another server holds the lock
log.info("not my locker");
// If the heartbeat is older than 5 minutes the lock-holding master is
// presumed dead; a node in the preemption group robs the lock.
long currTime = System.currentTimeMillis();
long lockTime = lock.getServerUpdate().getTime();
if(currTime - lockTime >1000*60*5L && isPreemptionHost()){
String isOnlineMsg = Environment.isOnline()?"线上":"线下";
MonitorUtil.sendMonitorEmail(isOnlineMsg+" 警告,发生master切换,时间距离:"+currTime+":"+lockTime);
lock.setHost(host);
lock.setServerUpdate(new Date());
lock.setSubgroup(Environment.getScheduleGroup());
getHibernateTemplate().update(lock);
log.error("rob the locker and update");
zeusSchedule.startup(port);
}else{// the master looks healthy: this node steps down from the master role
zeusSchedule.shutdown();
}
}
try {
// Regardless of role, connect this node's worker to the current master host.
worker.connect(lock.getHost(),port);
} catch (Exception e) {
ScheduleInfoLog.error("start up worker fail", e);
}
}
当主节点读取完毕之后,会执行zeusSchedule.startup(port)这个方法,我们接下来看这个zeusSchedule.startup(port)会执行什么操作:
package com.taobao.zeus.schedule;
import java.util.concurrent.atomic.AtomicBoolean;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.context.ApplicationContext;
import com.taobao.zeus.socket.master.MasterContext;
/**
 * Zeus scheduling facade: switches this node's master role on and off.
 * Both operations are idempotent thanks to the CAS on the started flag.
 * @author zhoufang
 */
public class ZeusSchedule{
    private static Logger log = LoggerFactory.getLogger(ZeusSchedule.class);
    // Guards against double start / double stop.
    private AtomicBoolean started = new AtomicBoolean(false);
    private MasterContext masterContext;
    private ApplicationContext springContext;

    public ZeusSchedule(ApplicationContext applicationContext){
        this.springContext = applicationContext;
    }

    /** Brings up the master context on the given port; no-op when already running. */
    public void startup(int port){
        boolean firstStart = started.compareAndSet(false, true);
        if (firstStart) {
            log.info("begin to initialize master context");
            // Create the MasterContext and run its init sequence.
            masterContext = new MasterContext(springContext);
            log.info("begin to init");
            masterContext.init(port);
        }
    }

    /** Tears down the master context; no-op when not running. */
    public void shutdown(){
        boolean wasRunning = started.compareAndSet(true, false);
        if (wasRunning) {
            masterContext.destory();
        }
    }
}
这个MasterContext的init()方法方法又会执行什么操作,继续跟进MasterContext如下:
/**
 * Holds all master-side state: the quartz scheduler, the dispatcher, the
 * Netty server/handler, the job queues and the worker channel map.
 */
public class MasterContext {
private static Logger log = LoggerFactory.getLogger(MasterContext.class);
private Map<Channel, MasterWorkerHolder> workers=new ConcurrentHashMap<Channel, MasterWorkerHolder>();
private ApplicationContext applicationContext;
private Master master;
private Scheduler scheduler;
private Dispatcher dispatcher;// dispatcher that fans events out to the registered listeners
private Map<String,HostGroupCache> hostGroupCache;
// scheduled jobs, keyed by jobId
// private Queue<JobElement> queue=new ArrayBlockingQueue<JobElement>(10000);
/**
 * Scheduled-job queue: a blocking priority queue with an initial capacity of
 * 10000. The comparator orders by priorityLevel so that elements with a
 * higher priorityLevel are dequeued first.
 */
private Queue<JobElement> queue=new PriorityBlockingQueue<JobElement>(10000, new Comparator<JobElement>() {
public int compare(JobElement je1, JobElement je2) {
int numbera = je1.getPriorityLevel();
int numberb = je2.getPriorityLevel();
if (numberb > numbera) {
return 1;
} else if (numberb < numbera) {
return -1;
} else {
return 0;
}
}
});
private Queue<JobElement> exceptionQueue = new LinkedBlockingQueue<JobElement>();
// debug jobs, keyed by debugId
private Queue<JobElement> debugQueue=new ArrayBlockingQueue<JobElement>(1000);
// manually triggered jobs, keyed by historyId
private Queue<JobElement> manualQueue=new ArrayBlockingQueue<JobElement>(1000);
private MasterHandler handler;
private MasterServer server;
private ExecutorService threadPool=Executors.newCachedThreadPool();
private ScheduledExecutorService schedulePool=Executors.newScheduledThreadPool(12);
public MasterContext(ApplicationContext applicationContext){
this.applicationContext=applicationContext;
}
public void init(int port){
log.info("init begin");
try {
// First start the quartz scheduler, configured from zeusQuartz.properties.
StdSchedulerFactory stdSchedulerFactory = new StdSchedulerFactory();
stdSchedulerFactory.initialize("zeusQuartz.properties");
scheduler = stdSchedulerFactory.getScheduler();
scheduler.start();
} catch (SchedulerException e) {
ScheduleInfoLog.error("schedule start fail", e);
}
// Then run the remaining initialization steps; once they finish, the Zeus
// startup sequence is essentially complete.
// 1. Create the Dispatcher, the job-scheduling event dispatcher.
dispatcher=new Dispatcher();
// 2. Create the MasterHandler, the Netty channel event handler.
handler=new MasterHandler(this);
// 3. Create and start the MasterServer, the Netty server endpoint.
server=new MasterServer(handler);
server.start(port);
// 4. Create the Master, the core class driving the whole scheduling layer.
master=new Master(this);
log.info("init finish");
}
}
至此,Zeus的启动初始化过程基本上结束。当然,本文只是大概地分析了整个启动过程中需要执行的核心初始化操作,后续的博客将逐步分解每个初始化步骤具体执行的操作。