/usr/local/jdk1.7/bin/java -cp /usr/local/spark/lib/postgresql-9.4-1201.jdbc41.jar:/usr/local/spark/sbin/../conf/:/usr/local/spark/lib/spark-assembly-1.5.2-hadoop2.6.0.jar:/usr/local/spark/lib/datanucleus-core-3.2.10.jar:/usr/local/spark/lib/datanucleus-rdbms-3.2.9.jar:/usr/local/spark/lib/datanucleus-api-jdo-3.2.6.jar:/usr/local/hadoop/etc/hadoop/ -Dspark.cores.max=2 -Xms1g -Xmx1g -XX:MaxPermSize=256m org.apache.spark.deploy.SparkSubmit --master yarn-client --class org.apache.spark.sql.hive.thriftserver.HiveThriftServer2 spark-internal
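This JVM invocation is essentially what Spark's sbin/start-thriftserver.sh launcher boils down to. The arguments after org.apache.spark.deploy.SparkSubmit (--master yarn-client --class org.apache.spark.sql.hive.thriftserver.HiveThriftServer2 spark-internal) are what the main method below receives; spark-internal is SparkSubmit's placeholder for a primary resource that ships inside Spark itself rather than as a user jar.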
This reading focuses on how the Thriftserver application gets submitted to YARN (the application shown in the 8088 web UI should ultimately be named SparkSQL).
The main method of org.apache.spark.deploy.SparkSubmit:
def main(args: Array[String]): Unit = {
  // Initialize a SparkSubmitArguments object from the arguments passed in.
  // Here the arguments are:
  // --master yarn-client --class org.apache.spark.sql.hive.thriftserver.HiveThriftServer2 spark-internal
  val appArgs = new SparkSubmitArguments(args)
  if (appArgs.verbose) {
    printStream.println(appArgs)
  }
  // If action was never assigned it defaults to SUBMIT; otherwise the assigned value is used.
  // Nothing assigns it here, so we enter submit(appArgs).
  appArgs.action match {
    case SparkSubmitAction.SUBMIT => submit(appArgs)
    case SparkSubmitAction.KILL => kill(appArgs)
    case SparkSubmitAction.REQUEST_STATUS => requestStatus(appArgs)
  }
}
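Where does the SUBMIT default come from? SparkSubmitArguments only assigns action when --kill or --status is parsed; after parsing it falls back to SUBMIT. A minimal self-contained sketch of that defaulting pattern (the names mirror Spark 1.x, but this is not the verbatim source):

object ActionDefaulting {
  object SparkSubmitAction extends Enumeration {
    val SUBMIT, KILL, REQUEST_STATUS = Value
  }

  def main(args: Array[String]): Unit = {
    // action stays null unless --kill / --status was parsed on the command line
    var action: SparkSubmitAction.Value = null
    // the fallback applied after parsing: default to SUBMIT
    action = Option(action).getOrElse(SparkSubmitAction.SUBMIT)
    println(action) // prints SUBMIT
  }
}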
Into the body of submit (line 145):
private[spark] def submit(args: SparkSubmitArguments): Unit = {
  // prepareSubmitEnvironment returns a 4-tuple:
  // (child process arguments, child classpath entries, map of system properties, child main class)
  // 1. The master argument decides how to submit; ours is yarn-client, and the "yarn" part
  //    means the application runs on YARN.
  // 2. The "client" part of yarn-client means the deploy mode is client.
  // 3. In client mode: (a) the main class comes from --class, here
  //    org.apache.spark.sql.hive.thriftserver.HiveThriftServer2; (b) all jars are
  //    collected, split on commas.
  val (childArgs, childClasspath, sysProps, childMainClass) = prepareSubmitEnvironment(args)

  def doRunMain(): Unit = {
    if (args.proxyUser != null) {
      val proxyUser = UserGroupInformation.createProxyUser(args.proxyUser,
        UserGroupInformation.getCurrentUser())
      try {
        proxyUser.doAs(new PrivilegedExceptionAction[Unit]() {
          override def run(): Unit = {
            runMain(childArgs, childClasspath, sysProps, childMainClass, args.verbose)
          }
        })
      } catch {
        case e: Exception =>
          // Hadoop's AuthorizationException suppresses the exception's stack trace, which
          // makes the message printed to the output by the JVM not very helpful. Instead,
          // detect exceptions with empty stack traces here, and treat them differently.
          if (e.getStackTrace().length == 0) {
            printStream.println(s"ERROR: ${e.getClass().getName()}: ${e.getMessage()}")
            exitFn()
          } else {
            throw e
          }
      }
    } else {
      // Use reflection to run the main method of the child main class, passing in the arguments
      runMain(childArgs, childClasspath, sysProps, childMainClass, args.verbose)
    }
  }

  // In standalone cluster mode, there are two submission gateways:
  //  (1) The traditional Akka gateway using o.a.s.deploy.Client as a wrapper
  //  (2) The new REST-based gateway introduced in Spark 1.3
  // The latter is the default behavior as of Spark 1.3, but Spark submit will fail over
  // to use the legacy gateway if the master endpoint turns out to be not a REST server.
  if (args.isStandaloneCluster && args.useRest) {
    try {
      printStream.println("Running Spark using the REST application submission protocol.")
      doRunMain()
    } catch {
      // Fail over to use the legacy submission gateway
      case e: SubmitRestConnectionException =>
        printWarning(s"Master endpoint ${args.master} was not a REST server. " +
          "Falling back to legacy submission gateway instead.")
        args.useRest = false
        submit(args)
    }
  // In all other modes, just run the main class as prepared
  } else {
    // Note: this doRunMain is simply the inner method def doRunMain(): Unit defined above
    doRunMain()
  }
}
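The runMain call inside doRunMain is where the child main class actually gets executed. A minimal sketch of the reflection involved, assuming the class is already on the classpath (the real runMain additionally builds a child classloader from childClasspath and applies sysProps as system properties before invoking):

object RunMainSketch {
  def main(args: Array[String]): Unit = {
    val childMainClass = "org.apache.spark.sql.hive.thriftserver.HiveThriftServer2"
    val childArgs = Seq.empty[String]
    // Load the child main class by name and invoke its static main(Array[String])
    val mainClass = Class.forName(childMainClass)
    val mainMethod = mainClass.getMethod("main", classOf[Array[String]])
    mainMethod.invoke(null, childArgs.toArray) // static method, so the receiver is null
  }
}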
Into the main method of org.apache.spark.sql.hive.thriftserver.HiveThriftServer2 (line 51).
(The code is a bit hard to find; it lives under spark-parent_2.10/spark-1.3.0/sql/hive-thriftserver.)
def main(args: Array[String]) {
  // org.apache.hive.service.server.ServerOptionsProcessor
  // (in the Hive source tree under service/src/java/org/apache/hive/service/server)
  val optionsProcessor = new ServerOptionsProcessor("HiveThriftServer2")
  if (!optionsProcessor.process(args)) {
    System.exit(-1)
  }
  logInfo("Starting SparkContext")
  // Initialize the SparkContext and HiveContext
  SparkSQLEnv.init()
  Runtime.getRuntime.addShutdownHook(
    new Thread() {
      override def run() {
        SparkSQLEnv.stop()
      }
    }
  )
  try {
    // HiveThriftServer2 is actually a subclass of org.apache.hive.service.server.HiveServer2.
    // init() assigns sparkSqlCliService to the cliService field;
    // HIVE_SERVER2_TRANSPORT_MODE defaults to binary (TCP).
    val server = new HiveThriftServer2(SparkSQLEnv.hiveContext)
    server.init(SparkSQLEnv.hiveContext.hiveconf)
    // start() iterates over the service list and starts each service, so it ends up
    // calling sparkSqlCliService.start(). Note that SparkSQLCLIService is a subclass of
    // org.apache.hive.service.cli.CLIService, whose start() mainly constructs a
    // HiveMetaStoreClient, passing hiveconf to its constructor.
    server.start()
    logInfo("HiveThriftServer2 started")
    // Register a listener on the SparkContext to track the server's sessions and statements
    SparkSQLEnv.sparkContext.addSparkListener(new HiveThriftServer2Listener(server))
  } catch {
    case e: Exception =>
      logError("Error starting HiveThriftServer2", e)
      System.exit(-1)
  }
}
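The "iterate over the service list and start each one" behavior comes from Hive's CompositeService, which HiveServer2 extends. A simplified sketch of the pattern (not the Hive source; the real class also tracks service state and rolls back already-started services on failure):

object CompositeServiceSketch {
  trait Service { def start(): Unit }

  // A parent service fans start() out to every registered child; this is how
  // server.start() ends up calling sparkSqlCliService.start().
  class CompositeService(name: String) extends Service {
    private val serviceList = scala.collection.mutable.ListBuffer.empty[Service]
    def addService(s: Service): Unit = serviceList += s
    override def start(): Unit = serviceList.foreach(_.start())
  }

  def main(args: Array[String]): Unit = {
    val server = new CompositeService("HiveServer2")
    server.addService(new Service { def start(): Unit = println("cliService.start()") })
    server.start() // prints cliService.start()
  }
}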
In summary: after starting Spark's thrift server, what is ultimately being used underneath is Hive's own HiveServer2 and HiveMetaStoreClient.
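Since it is a standard HiveServer2 underneath, any HiveServer2-compatible client can talk to it once it is up. For example, a plain JDBC connection (the host and port here are assumptions; 10000 is HiveServer2's default binary-mode port):

import java.sql.DriverManager

object ThriftServerQuery {
  def main(args: Array[String]): Unit = {
    // hive-jdbc must be on the classpath; localhost:10000/default is an assumed endpoint
    Class.forName("org.apache.hive.jdbc.HiveDriver")
    val conn = DriverManager.getConnection("jdbc:hive2://localhost:10000/default", "hive", "")
    try {
      val rs = conn.createStatement().executeQuery("SHOW TABLES")
      while (rs.next()) println(rs.getString(1))
    } finally {
      conn.close()
    }
  }
}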