1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
/**
 * Start the execution of the streams. A context may only be started once:
 * restarting a started or stopped context fails fast.
 *
 * @throws SparkException if this context was already started or already stopped
 */
def start(): Unit = synchronized {
  // Reject invalid lifecycle transitions up front.
  state match {
    case Started =>
      throw new SparkException("StreamingContext has already been started")
    case Stopped =>
      throw new SparkException("StreamingContext has already been stopped")
    case _ => // still Initialized — OK to proceed
  }
  // Sanity-check the setup (e.g. that a batch duration has been set).
  validate()
  sparkContext.setCallSite(DStream.getCreationSite())
  // Step 1: start the scheduler, the entry point that drives everything else.
  scheduler.start()
  // Lifecycle bookkeeping: Initialized -> Started -> (eventually) Stopped.
  state = Started
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
// After start(), this scheduler waits for incoming messages and processes jobs.
/**
 * Start the job scheduler: create the event actor that dispatches every
 * JobSchedulerEvent to processEvent, then start the listener bus, the
 * receiver tracker and the job generator. No-op if already started.
 */
def start(): Unit = synchronized {
  if (eventActor != null) return // scheduler has already been started
  logDebug("Starting JobScheduler")
  // Actor that funnels every JobSchedulerEvent into processEvent.
  eventActor = ssc.env.actorSystem.actorOf(Props(new Actor {
    def receive = {
      case event: JobSchedulerEvent => processEvent(event)
    }
  }), "JobScheduler")
  // Step 2: start listenerBus, receiverTracker and jobGenerator (in that order).
  listenerBus.start() // a StreamingListenerBus instance
  receiverTracker = new ReceiverTracker(ssc)
  receiverTracker.start()
  jobGenerator.start()
  logInfo("Started JobScheduler")
}
|
1
2
|
// Recurring timer that fires once per batch interval; each tick posts a
// GenerateJobs message (wrapping the tick time) to the generator's event actor.
private val timer = new RecurringTimer(clock, ssc.graph.batchDuration.milliseconds,
  longTime => eventActor ! GenerateJobs(new Time(longTime)), "JobGenerator")
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
|
/** Start generation of jobs.
 *
 * Creates the event actor that routes every JobGeneratorEvent to
 * processEvent, then either restarts from a checkpoint or begins
 * generating jobs from scratch. No-op if already started.
 */
def start(): Unit = synchronized {
  if (eventActor != null) return // generator has already been started
  eventActor = ssc.env.actorSystem.actorOf(Props(new Actor {
    def receive = {
      case event: JobGeneratorEvent => processEvent(event) // handle the different message types
    }
  }), "JobGenerator")
  if (ssc.isCheckpointPresent) {
    // Recovering from a checkpoint: resume job generation where it left off.
    restart()
  } else {
    // Fresh start: begin generating jobs.
    startFirstTime()
  }
}
/** Starts the generator for the first time (i.e. no checkpoint to recover from). */
private def startFirstTime() {
  // Align the first batch on the recurring timer's schedule.
  val firstTime = new Time(timer.getStartTime())
  // Start the DStreamGraph one batch interval earlier than the first tick.
  graph.start(firstTime - graph.batchDuration)
  // Step 3: start the RecurringTimer; its callback posts
  // GenerateJobs(new Time(longTime)) to the event actor on every tick.
  timer.start(firstTime.milliseconds)
  logInfo("Started JobGenerator at " + firstTime)
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
|
/**
 * Start at the given start time: record it as the next tick and launch the
 * daemon thread that drives the callback loop (which, for the JobGenerator,
 * posts a GenerateJobs message on every tick).
 *
 * @param startTime epoch milliseconds of the first tick
 * @return the first tick time (== startTime)
 */
def start(startTime: Long): Long = synchronized {
  nextTime = startTime
  // Step 4: launch the background thread that runs the tick loop.
  thread.start()
  logInfo(s"Started timer for $name at time $nextTime")
  nextTime
}
// Background thread that runs the timer loop. Marked as a daemon so it does
// not keep the JVM alive on shutdown.
private val thread = new Thread("RecurringTimer - " + name) {
  setDaemon(true)
  override def run() { loop }
}
/**
 * Repeatedly call the callback every interval.
 *
 * Runs until `stopped` is set: waits on the clock until the next tick time,
 * fires the callback (step 4: sends the GenerateJobs message via akka), then
 * advances prevTime/nextTime by one period. An InterruptedException is
 * deliberately swallowed — it is how the loop is terminated.
 */
private def loop() {
  try {
    while (!stopped) {
      clock.waitTillTime(nextTime)
      callback(nextTime)
      prevTime = nextTime
      nextTime += period
      logDebug("Callback for " + name + " called at time " + prevTime)
    }
  } catch {
    case e: InterruptedException => // expected on stop; exit the loop quietly
  }
}
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
|
/** Processes all events.
 *
 * Dispatch point of the JobGenerator actor: each message type is routed to
 * the handler of the same name.
 */
private def processEvent(event: JobGeneratorEvent) {
  logDebug("Got event " + event)
  event match {
    case GenerateJobs(time) => generateJobs(time)
    case ClearMetadata(time) => clearMetadata(time)
    case DoCheckpoint(time) => doCheckpoint(time)
    case ClearCheckpointData(time) => clearCheckpointData(time)
  }
}
/** Generate jobs and perform checkpoint for the given `time`. */
// Invoked once per batch interval, driven by the recurring timer's
// GenerateJobs messages.
private def generateJobs(time: Time) {
  // Set the SparkEnv in this thread, so that job generation code can access the environment
  // Example: BlockRDDs are created in this thread, and it needs to access BlockManager
  // Update: This is probably redundant after threadlocal stuff in SparkEnv has been removed.
  SparkEnv.set(ssc.env)
  Try {
    // Step 5: assign the received blocks to this batch, ahead of the job
    // submission in step 7.
    jobScheduler.receiverTracker.allocateBlocksToBatch(time) // allocate received blocks to batch
    // Step 6: have each output stream build its job for this batch
    // (via outputStream.generateJob).
    graph.generateJobs(time) // generate jobs using allocated block
  } match {
    case Success(jobs) =>
      val receivedBlockInfos =
        jobScheduler.receiverTracker.getBlocksOfBatch(time).mapValues { _.toArray }
      // Step 7: submit the whole batch as one JobSet.
      jobScheduler.submitJobSet(JobSet(time, jobs, receivedBlockInfos))
    case Failure(e) =>
      jobScheduler.reportError("Error generating jobs for time " + time, e)
  }
  // Always request a checkpoint afterwards, whether generation succeeded or not.
  eventActor ! DoCheckpoint(time)
}
|
1
2
3
4
5
6
7
8
9
|
/**
 * Build one job per registered output stream for the given batch time.
 * Synchronized on this graph so the set of output streams is stable while
 * jobs are being generated.
 *
 * @param time the batch time to generate jobs for
 * @return the jobs produced by the output streams (streams may produce none)
 */
def generateJobs(time: Time): Seq[Job] = {
  logDebug(s"Generating jobs for time $time")
  val generatedJobs = this.synchronized {
    outputStreams.flatMap { stream => stream.generateJob(time) }
  }
  logDebug(s"Generated ${generatedJobs.length} jobs for time $time")
  generatedJobs
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
|
/** Method that generates a RDD for the given time */
/** The core DStream hook: every concrete DStream subclass must implement
 * compute() to produce its RDD for `validTime` (the computed RDD is the
 * result; None means no RDD for that batch). */
def compute (validTime: Time): Option[RDD[T]]
/**
 * Generate a SparkStreaming job for the given time. This is an internal method that
 * should not be called directly. This default implementation creates a job
 * that materializes the corresponding RDD. Subclasses of DStream may override this
 * to generate their own jobs.
 */
private[streaming] def generateJob(time: Time): Option[Job] = {
  // getOrCompute supplies the (cached or freshly computed) RDD for this batch;
  // when one exists, wrap a thunk that materializes it into a Job.
  getOrCompute(time).map { rdd =>
    val jobFunc = () => {
      // Running the RDD with a no-op per-partition function forces it to be computed.
      val emptyFunc = { (iterator: Iterator[T]) => {} }
      context.sparkContext.runJob(rdd, emptyFunc)
    }
    new Job(time, jobFunc)
  }
}
/**
 * Get the RDD corresponding to the given time; either retrieve it from cache
 * or compute-and-cache it.
 *
 * This is the source of the RDD that is ultimately run via sparkContext.runJob
 * — the heart of the stream. Each generated RDD is keyed by its batch time
 * and stored in the `generatedRDDs` HashMap.
 */
private[streaming] def getOrCompute(time: Time): Option[RDD[T]] = {
  // If RDD was already generated, then retrieve it from HashMap,
  // or else compute the RDD
  generatedRDDs.get(time).orElse {
    // Compute the RDD if time is valid (e.g. correct time in a sliding window)
    // of RDD generation, else generate nothing.
    if (isTimeValid(time)) {
      // For each distinct batch time, the subclass's compute() implementation
      // is invoked here to produce a new RDD.
      // Set the thread-local property for call sites to this DStream's creation site
      // such that RDDs generated by compute gets that as their creation site.
      // Note that this `getOrCompute` may get called from another DStream which may have
      // set its own call site. So we store its call site in a temporary variable,
      // set this DStream's creation site, generate RDDs and then restore the previous call site.
      val prevCallSite = ssc.sparkContext.getCallSite()
      ssc.sparkContext.setCallSite(creationSite)
      // Disable checks for existing output directories in jobs launched by the streaming
      // scheduler, since we may need to write output to an existing directory during checkpoint
      // recovery; see SPARK-4835 for more details. We need to have this call here because
      // compute() might cause Spark jobs to be launched.
      val rddOption = PairRDDFunctions.disableOutputSpecValidation.withValue(true) {
        compute(time)
      }
      ssc.sparkContext.setCallSite(prevCallSite)

      rddOption.foreach { case newRDD =>
        // Register the generated RDD for caching and checkpointing
        if (storageLevel != StorageLevel.NONE) {
          newRDD.persist(storageLevel)
          logDebug(s"Persisting RDD ${newRDD.id} for time $time to $storageLevel")
        }
        if (checkpointDuration != null && (time - zeroTime).isMultipleOf(checkpointDuration)) {
          newRDD.checkpoint()
          logInfo(s"Marking RDD ${newRDD.id} for time $time for checkpointing")
        }
        // Cache the freshly computed RDD under its batch time.
        generatedRDDs.put(time, newRDD)
      }
      rddOption
    } else {
      None
    }
  }
}
|
1
2
3
4
5
6
7
8
9
10
|
/**
 * Record the JobSet and hand each of its jobs to the job executor
 * (step 8: the jobs are executed on the executor's thread pool).
 * An empty batch is only logged, never submitted.
 */
def submitJobSet(jobSet: JobSet) {
  if (jobSet.jobs.nonEmpty) {
    jobSets.put(jobSet.time, jobSet)
    // Step 8: run the batch's jobs concurrently via the executor.
    for (job <- jobSet.jobs) {
      jobExecutor.execute(new JobHandler(job))
    }
    logInfo("Added jobs for time " + jobSet.time)
  } else {
    logInfo("No jobs added for time " + jobSet.time)
  }
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
// Runnable wrapper that executes a single streaming Job on the job executor's
// thread pool, notifying the scheduler's event actor before and after the run.
private class JobHandler(job: Job) extends Runnable {
  def run() {
    // 9.1: tell the scheduler the job is starting.
    eventActor ! JobStarted(job)
    // Disable checks for existing output directories in jobs launched by the streaming scheduler,
    // since we may need to write output to an existing directory during checkpoint recovery;
    // see SPARK-4835 for more details.
    PairRDDFunctions.disableOutputSpecValidation.withValue(true) {
      // 9.2: actually run the job's body.
      job.run()
    }
    // 9.3: report completion.
    eventActor ! JobCompleted(job)
  }
}
|