2021SC@SDUSC
wait-for-workers-launch函数定义如下:
wait-for-workers-launch函数
;; wait-for-workers-launch函数等待所有worker启动完成
(defn- wait-for-workers-launch
  "Blocks until every worker in `ids` has either launched (written its first
   heartbeat to local state) or exceeded the configured start timeout. All
   workers share one deadline, measured from when this function is entered."
  [conf ids]
  (let [launch-started-at (current-time-secs)]
    ;; wait for each worker in turn; order matches the order of `ids`
    (dorun
      (map #(wait-for-worker-launch conf % launch-started-at) ids))))
wait-for-worker-launch函数定义如下:
wait-for-worker-launch函数
;; wait-for-worker-launch函数等待worker启动完成,worker启动完成的条件是:如果worker在规定的心跳超时时间内有一次心跳那么就说明worker成功启动
(defn- wait-for-worker-launch
  "Polls the local state of worker `id` until it records a first heartbeat,
   giving up once SUPERVISOR-WORKER-START-TIMEOUT-SECS have elapsed since
   `start-time`. A worker counts as launched as soon as any heartbeat exists.
   Logs a failure message if the timeout expires without a heartbeat."
  [conf id start-time]
  (let [state (worker-state conf id)]
    (loop []
      (let [heartbeat (.get state LS-WORKER-HEARTBEAT)
            elapsed   (- (current-time-secs) start-time)
            timeout   (conf SUPERVISOR-WORKER-START-TIMEOUT-SECS)]
        (when (and (not heartbeat)
                   (< elapsed timeout))
          (log-message id " still hasn't started")
          ;; poll every half second until heartbeat or timeout
          (Time/sleep 500)
          (recur))))
    (when-not (.get state LS-WORKER-HEARTBEAT)
      (log-message "Worker " id " failed to start"))))
mk-synchronize-supervisor函数定义如下:
mk-synchronize-supervisor函数
;; mk-synchronize-supervisor函数返回一个名字为"this"的函数,
;; Returns the synchronization closure (named "this") that reconciles this
;; supervisor's local state with the assignments stored in ZooKeeper: it
;; downloads code for newly assigned topologies, records the new local
;; assignments, removes code for topologies no longer assigned here, and
;; schedules sync-processes to start/kill workers accordingly.
( defn mk-synchronize-supervisor [ supervisor sync-processes event-manager processes-event-manager ]
( fn this []
;; cluster configuration
( let [ conf ( :conf supervisor)
;; handle for reading assignment data from ZooKeeper
storm-cluster-state ( :storm-cluster-state supervisor)
;; the ISupervisor implementation backing this supervisor
^ ISupervisor isupervisor ( :isupervisor supervisor)
;; LocalState: the supervisor's local persistent key/value store
^ LocalState local-state ( :local-state supervisor)
;; sync-callback re-enqueues "this" on the event manager so the sync runs
;; again on the manager's thread. It is re-registered on every run as the
;; ZooKeeper watch, so any change under "/assignments" triggers another
;; synchronization pass.
sync-callback ( fn [ & ignored ] ( .add event-manager this))
;; locally cached versioned assignments: topology-id -> {:data ... :version ...}
assignment-versions @( :assignment-versions supervisor)
;; assignments-snapshot: topology-id -> AssignmentInfo for the whole
;; cluster; versions: the same data with version numbers. The
;; assignments-snapshot function reads "/assignments" from ZooKeeper and
;; installs sync-callback as the watch (see its definition below).
{ assignments-snapshot :assignments versions :versions } ( assignments-snapshot
storm-cluster-state sync-callback
assignment-versions)
;; topology-id -> code directory of that topology on nimbus
storm-code-map ( read-storm-code-locations assignments-snapshot)
;; ids of topologies whose code has already been downloaded into this
;; supervisor's local "{storm.local.dir}/supervisor/stormdist" directory
downloaded-storm-ids ( set ( read-downloaded-storm-ids conf))
;; everything assigned to this supervisor: port -> LocalAssignment
all-assignment ( read-assignments
assignments-snapshot
( :assignment-id supervisor))
;; keep only ports confirmed by isupervisor.confirmAssigned. The
;; standalone-supervisor implementation always returns true, in which
;; case new-assignment equals all-assignment.
new-assignment ( ->> all-assignment
( filter-key #( .confirmAssigned isupervisor %)))
;; set of topology ids that should run on this supervisor
assigned-storm-ids ( assigned-storm-ids-from-port-assignments new-assignment)
;; the assignment this supervisor recorded on its previous sync pass
existing-assignment ( .get local-state LS-LOCAL-ASSIGNMENTS )]
( log-debug "Synchronizing supervisor")
( log-debug "Storm code map: " storm-code-map)
( log-debug "Downloaded storm ids: " downloaded-storm-ids)
( log-debug "All assignment: " all-assignment)
( log-debug "New assignment: " new-assignment)
;; download code first
;; This might take awhile
;; - should this be done separately from usual monitoring?
;; should we only download when topology is assigned to this supervisor?
;; iterate over every assigned topology's nimbus code directory
( doseq [[ storm-id master-code-dir ] storm-code-map ]
;; download only when the topology is assigned to this supervisor but its
;; code has not yet been fetched from nimbus
( when ( and ( not ( downloaded-storm-ids storm-id))
( assigned-storm-ids storm-id))
( log-message "Downloading code for storm id "
storm-id
" from "
master-code-dir)
;; fetch stormjar.jar, the serialized topology, and its run-time config
;; from nimbus into this supervisor's local stormdist directory
( download-storm-code conf storm-id master-code-dir)
( log-message "Finished downloading code for storm id "
storm-id
" from "
master-code-dir)
))
( log-debug "Writing new assignment "
( pr-str new-assignment))
;; ports in the old assignment but not the new one no longer belong to
;; this supervisor; notify isupervisor so those workers can be killed
( doseq [p ( set/difference ( set ( keys existing-assignment))
( set ( keys new-assignment )))]
;; killedWorker is a no-op in the standalone ISupervisor implementation
( .killedWorker isupervisor ( int p)))
;; assigned is likewise a no-op in the standalone implementation
( .assigned isupervisor ( keys new-assignment))
;; persist the latest assignment into local state
( .put local-state
LS-LOCAL-ASSIGNMENTS
new-assignment)
;; NOTE(review): (swap! a versions) invokes the map `versions` as the
;; update function, i.e. replaces the cache with (versions old-value) —
;; almost certainly nil. (reset! ... versions) looks intended; verify
;; against upstream before changing.
( swap! ( :assignment-versions supervisor) versions)
;; cache the newest cluster assignment for this supervisor
( reset! ( :curr-assignment supervisor) new-assignment)
;; remove any downloaded code that's no longer assigned or active
;; important that this happens after setting the local assignment so that
;; synchronize-supervisor doesn't try to launch workers for which the
;; resources don't exist
;; on Windows only: shut down workers whose state is :disallowed
( if on-windows? ( shutdown-disallowed-workers supervisor))
;; walk the topologies whose code is present locally
( doseq [ storm-id downloaded-storm-ids ]
;; delete "{storm.local.dir}/supervisor/stormdist/{storm-id}" recursively
;; when that topology is no longer assigned to this supervisor
( when-not ( assigned-storm-ids storm-id)
( log-message "Removing code for storm id "
storm-id)
( try
( rmr ( supervisor-stormdist-root conf storm-id))
( catch Exception e ( log-message ( .getMessage e))))
))
;; run sync-processes on its own event-manager thread; it is slow, so it
;; must not block this synchronization loop
( .add processes-event-manager sync-processes)
)))
assignments-snapshot函数定义如下:
assignments-snapshot函数
;; assignments-snapshot函数从zookeeper的子节点"/assignments"获取分配信息,并将回调函数添加到子节点"/assignments"上,assignment-versions绑定该supervisor本地缓存的带有版本号的分配信息
;; Reads the cluster assignment snapshot from ZooKeeper's "/assignments"
;; subtree, installing `callback` as the watch so the caller is re-triggered
;; on changes. `assignment-versions` is the supervisor's local cache of
;; versioned assignments, used to skip re-fetching unchanged nodes.
( defn- assignments-snapshot [ storm-cluster-state callback assignment-versions ]
;; storm-ids: ids of all assigned topologies (children of "/assignments").
;; Passing callback registers a watch so the supervisor notices assignments
;; being added or removed.
( let [ storm-ids ( .assignments storm-cluster-state callback )]
;; new-assignments: the freshest versioned assignment per topology
( let [ new-assignments
( ->>
;; sid is a topology id
( dofor [ sid storm-ids ]
;; version of this sid's assignment as cached locally (nil if absent)
( let [ recorded-version ( :version ( get assignment-versions sid ))]
;; read the version of "/assignments/{sid}" from ZooKeeper, registering
;; callback as the watch on that node
( if-let [ assignment-version ( .assignment-version storm-cluster-state sid callback )]
;; if the cached version matches ZooKeeper's, reuse the cached entry;
;; otherwise re-fetch the versioned data (watch re-registered), so the
;; supervisor notices reassignment of an existing topology
( if ( = assignment-version recorded-version)
{ sid ( get assignment-versions sid )}
{ sid ( .assignment-info-with-version storm-cluster-state sid callback )})
;; the version read failed: record nil so the entry is dropped below
{ sid nil })))
;; merge the per-sid maps into one:
;; {sid_1 {:data data_1 :version version_1}, ..., sid_n {...}}
( apply merge)
;; keep only entries whose value is non-nil
( filter-val not-nil? ))]
;; Return {:assignments {sid data ...} :versions {sid {:data .. :version ..} ...}}.
;; Each data value is an AssignmentInfo: the nimbus code dir, task start
;; times, and the task -> node+port mapping.
{ :assignments ( into {} ( for [[ k v ] new-assignments ] [ k ( :data v )]))
:versions new-assignments })))
read-assignments函数定义如下:
read-assignments函数
(defn- read-assignments
  "Returns map from port to struct containing :storm-id and :executors"
  [assignments-snapshot assignment-id]
  ;; Build this supervisor's port->LocalAssignment map for each topology,
  ;; then merge them all. merge-with serves purely as a collision detector:
  ;; a port claimed by two topologies invokes the merge fn, which throws.
  (let [per-topology-ports (dofor [sid (keys assignments-snapshot)]
                             (read-my-executors assignments-snapshot sid assignment-id))
        fail-on-conflict   (fn [& ignored]
                             (throw-runtime "Should not have multiple topologies assigned to one port"))]
    (apply merge-with fail-on-conflict per-topology-ports)))
read-my-executors函数定义如下:
read-my-executors函数
;; assignments-snapshot绑定topology-id->分配信息AssignmentInfo对象的map,assignment-id绑定supervisor-id,storm-id为topology-id
(defn- read-my-executors
  "For topology `storm-id`, returns port -> LocalAssignment for the executors
   placed on this supervisor (node id `assignment-id`) by the cluster-wide
   `assignments-snapshot` (topology-id -> AssignmentInfo)."
  [assignments-snapshot storm-id assignment-id]
  (let [assignment (get assignments-snapshot storm-id)
        ;; keep only executors whose node is this supervisor
        local-executors (filter (fn [[_ [node _]]] (= node assignment-id))
                                (:executor->node+port assignment))
        ;; port -> seq of executor ids; merge-with concat joins executors
        ;; that land on the same port
        executors-by-port (apply merge-with concat
                                 (for [[executor [_ port]] local-executors]
                                   {port [executor]}))]
    (into {}
          (for [[port executors] executors-by-port]
            ;; need to cast to int b/c it might be a long (due to how yaml parses things)
            ;; doall is to avoid serialization/deserialization problems with lazy seqs
            [(Integer. port) (LocalAssignment. storm-id (doall executors))]))))
download-storm-code函数定义如下:
download-storm-code函数
;; Multimethod: dispatches on cluster-mode (:distributed or :local) to select
;; the implementation that fetches a topology's code onto this supervisor.
( defmulti download-storm-code cluster-mode)
download-storm-code 函数是一个 "多重函数" ,根据 cluster-mode 函数的返回值决定调用哪个函数, cluster-mode 函数可能返回关键字 :distributed 和 :local ,如果返回 :distributed ,那么会调用下面这个函数。
( defmethod download-storm-code
;; master-code-dir: the topology's code directory on the nimbus server
:distributed [ conf storm-id master-code-dir ]
;; Downloading to permanent location is atomic
;; tmproot: local scratch path "{storm.local.dir}/supervisor/tmp/{uuid}" that
;; temporarily holds the files downloaded from nimbus
( let [ tmproot ( str ( supervisor-tmp-dir conf) file-path-separator ( uuid))
;; stormroot: the topology's final local path
;; "{storm.local.dir}/supervisor/stormdist/{storm-id}"
stormroot ( supervisor-stormdist-root conf storm-id )]
;; create the scratch directory
( FileUtils/forceMkdir ( File. tmproot))
;; fetch stormjar.jar (all of the topology's code) from nimbus's
;; "{storm.local.dir}/nimbus/stormdist/{storm-id}/" into tmproot
( Utils/downloadFromMaster conf ( master-stormjar-path master-code-dir) ( supervisor-stormjar-path tmproot))
;; fetch stormcode.ser (the serialized topology object) into tmproot
( Utils/downloadFromMaster conf ( master-stormcode-path master-code-dir) ( supervisor-stormcode-path tmproot))
;; fetch stormconf.ser (the topology's run-time configuration) into tmproot
( Utils/downloadFromMaster conf ( master-stormconf-path master-code-dir) ( supervisor-stormconf-path tmproot))
;; RESOURCES-SUBDIR is the string "resources": unzip every jar entry whose
;; path starts with "resources" into "{tmproot}/resources/..."
( extract-dir-from-jar ( supervisor-stormjar-path tmproot) RESOURCES-SUBDIR tmproot)
;; atomically move tmproot to stormroot, so that
;; "{storm.local.dir}/supervisor/stormdist/{storm-id}/" ends up containing
;; the resources directory, stormjar.jar, stormcode.ser and stormconf.ser
( FileUtils/moveDirectory ( File. tmproot) ( File. stormroot))
))
extract-dir-from-jar函数定义如下:
extract-dir-from-jar函数
;; jarpath标识jar路径,dir标识"resources",destdir标识"{tmproot}"路径
(defn extract-dir-from-jar
  "Extracts from the jar at `jarpath` every file whose entry name starts with
   `dir`, writing each one under `destdir` at its path inside the archive.
   Directory entries are skipped. An IOException is logged, not rethrown."
  [jarpath dir destdir]
  (try-cause
    (with-open [zip (ZipFile. jarpath)]
      (doseq [entry (enumeration-seq (.entries zip))
              :when (and (not (.isDirectory entry))
                         (.startsWith (.getName entry) dir))]
        (let [target (File. destdir (.getName entry))]
          ;; ensure the parent directories exist before writing
          (.mkdirs (.getParentFile target))
          (with-open [out (FileOutputStream. target)]
            (io/copy (.getInputStream zip entry) out)))))
    (catch IOException e
      (log-message "Could not extract " dir " from " jarpath))))
以上就是storm启动supervisor的完整流程,启动supervisor的工作主要是在mk-supervisor函数中进行的,所以阅读该部分源码时,要首先从该函数入手,然后依次分析在该函数中所调用的其他函数,根据函数的控制流程分析每个函数。