/**
 * The [[RpcEnv]] that this [[RpcEndpoint]] is registered to.
 */
val rpcEnv: RpcEnv

....
}
/**
 * Process messages from [[RpcEndpointRef.send]] or [[RpcCallContext.reply]]. If receiving an
 * unmatched message, a [[SparkException]] will be thrown and sent to `onError`.
 */
def receive: PartialFunction[Any, Unit] = {
  case _ => throw new SparkException(self + " does not implement 'receive'")
}
/**
 * Process messages from [[RpcEndpointRef.ask]]. If receiving an unmatched message, a
 * [[SparkException]] will be thrown and sent to `onError`.
 */
def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
  case _ => context.sendFailure(new SparkException(self + " won't reply anything"))
}
/**
 * Invoked before [[RpcEndpoint]] starts to handle any message.
 */
def onStart(): Unit = {
  // By default, do nothing.
}
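Taken together, the contract is: override receive for one-way messages coming from RpcEndpointRef.send, and receiveAndReply for ask-style requests that need an answer. Below is a minimal sketch of a hypothetical endpoint (EchoEndpoint is not part of the Spark source; also note that RpcEndpoint is private[spark], so real implementations live inside the org.apache.spark namespace):

import org.apache.spark.SparkException
import org.apache.spark.rpc.{RpcCallContext, RpcEndpoint, RpcEnv}

// Hypothetical endpoint, for illustration only.
class EchoEndpoint(override val rpcEnv: RpcEnv) extends RpcEndpoint {

  // One-way messages arriving via RpcEndpointRef.send
  override def receive: PartialFunction[Any, Unit] = {
    case msg: String => println(s"got one-way message: $msg")
  }

  // Request/response messages arriving via RpcEndpointRef.ask
  override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
    case msg: String => context.reply(s"echo: $msg")
    case other       => context.sendFailure(new SparkException(s"unexpected message: $other"))
  }
}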
/**
 * A separate client factory for file downloads. This avoids using the same RPC handler as
 * the main RPC context, so that events caused by these clients are kept isolated from the
 * main RPC traffic.
 *
 * It also allows for different configuration of certain properties, such as the number of
 * connections per peer.
 */
@volatile private var fileDownloadFactory: TransportClientFactory = _
val timeoutScheduler = ThreadUtils.newDaemonSingleThreadScheduledExecutor("netty-rpc-env-timeout")
// Because TransportClientFactory.createClient is blocking, we need to run it in this thread pool
// to implement non-blocking send/ask.
// TODO: a non-blocking TransportClientFactory.createClient in future
private[netty] val clientConnectionExecutor = ThreadUtils.newDaemonCachedThreadPool(
  "netty-rpc-connection", conf.getInt("spark.rpc.connect.threads", 64))
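In other words, the blocking connect is simply pushed onto this dedicated pool so that send/ask can return immediately. A simplified sketch of that pattern (names are illustrative, not the actual NettyRpcEnv internals):

import java.util.concurrent.ThreadPoolExecutor
import scala.concurrent.{ExecutionContext, Future}

// Sketch only: run the blocking connect on the dedicated pool and finish the
// send once the connection exists, so the caller never blocks.
def connectAndSendSketch(pool: ThreadPoolExecutor, address: String, message: Any)
                        (blockingConnect: String => AnyRef): Future[Unit] = {
  implicit val ec: ExecutionContext = ExecutionContext.fromExecutor(pool)
  Future {
    val client = blockingConnect(address) // may block, but off the caller's thread
    println(s"connected to $address with $client, sending $message")
  }
}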
@volatile private var server: TransportServer = _
private val stopped = new AtomicBoolean(false)
/**
 * A map for [[RpcAddress]] and [[Outbox]]. When we are connecting to a remote [[RpcAddress]],
 * we just put messages to its [[Outbox]] to implement a non-blocking `send` method.
 */
// One outbox per remote address
private val outboxes = new ConcurrentHashMap[RpcAddress, Outbox]()
/**
 * Remove the address's Outbox and stop it.
 */
private[netty] def removeOutbox(address: RpcAddress): Unit = {
  val outbox = outboxes.remove(address)
  if (outbox != null) {
    outbox.stop()
  }
}

// Start a TransportServer to receive remote messages
def startServer(bindAddress: String, port: Int): Unit = {
  val bootstraps: java.util.List[TransportServerBootstrap] =
    if (securityManager.isAuthenticationEnabled()) {
      java.util.Arrays.asList(new SaslServerBootstrap(transportConf, securityManager))
    } else {
      java.util.Collections.emptyList()
    }
  server = transportContext.createServer(bindAddress, port, bootstraps)
  dispatcher.registerRpcEndpoint(
    RpcEndpointVerifier.NAME, new RpcEndpointVerifier(this, dispatcher))
}
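The non-blocking send described above boils down to: look up (or create) the Outbox for the target address and append the message to it; the connect happens lazily in the background and the queue is drained once the connection is up. A toy, self-contained version of that idea (field and method names are illustrative, not the real Outbox API):

import java.util.concurrent.{ConcurrentHashMap, ConcurrentLinkedQueue}

// Toy model of "put the message in the address's outbox".
class ToyOutbox {
  private val messages = new ConcurrentLinkedQueue[Any]()
  def send(msg: Any): Unit = messages.add(msg)          // never blocks
  def drainTo(deliver: Any => Unit): Unit = {
    var msg = messages.poll()
    while (msg != null) { deliver(msg); msg = messages.poll() }
  }
  def stop(): Unit = messages.clear()
}

val toyOutboxes = new ConcurrentHashMap[String, ToyOutbox]()

def postMessage(address: String, msg: Any): Unit = {
  // keep exactly one outbox per remote address
  var outbox = toyOutboxes.get(address)
  if (outbox == null) {
    toyOutboxes.putIfAbsent(address, new ToyOutbox)
    outbox = toyOutboxes.get(address)
  }
  outbox.send(msg)   // the caller returns immediately; delivery happens later
}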
# included in all the spark scripts with source command
# should not be executable directly
# also should not be passed any arguments, since we need original $*
# symlink and absolute path should rely on SPARK_HOME to resolve
if [ -z "${SPARK_HOME}" ]; then
  export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
fi
# Set the SPARK_CONF_DIR directory
export SPARK_CONF_DIR="${SPARK_CONF_DIR:-"${SPARK_HOME}/conf"}"

# Add the PySpark classes to the PYTHONPATH:
if [ -z "${PYSPARK_PYTHONPATH_SET}" ]; then
  export PYTHONPATH="${SPARK_HOME}/python:${PYTHONPATH}"
  export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.4-src.zip:${PYTHONPATH}"
  export PYSPARK_PYTHONPATH_SET=1
fi

export JAVA_HOME=/opt/module/jdk1.8.0_162
# Starts the master on the machine this script is executed on.
if [ -z "${SPARK_HOME}" ]; then export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" fi
# NOTE: This exact class name is matched downstream by SparkSubmit.
# Any changes need to be reflected there.
CLASS="org.apache.spark.deploy.master.Master"   # the Master class that gets launched
if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then echo "Usage: ./sbin/start-master.sh [options]" pattern="Usage:" pattern+="\|Using Spark's default log4j profile:" pattern+="\|Registered signal handlers for"
# Find the port number for the master
if [ "$SPARK_MASTER_PORT" = "" ]; then
  SPARK_MASTER_PORT=7077
fi
if [ "$SPARK_MASTER_HOST" = "" ]; then case `uname` in (SunOS) SPARK_MASTER_HOST="`/usr/sbin/check-hostname | awk '{print $NF}'`" ;; (*) SPARK_MASTER_HOST="`hostname -f`" ;; esac fi
# Launch the slaves (this in turn invokes start-slave.sh on each slave node)
"${SPARK_HOME}/sbin/slaves.sh" cd "${SPARK_HOME}" \; "${SPARK_HOME}/sbin/start-slave.sh" "spark://$SPARK_MASTER_HOST:$SPARK_MASTER_PORT"
# Starts a slave on the machine this script is executed on.
#
# Environment Variables
#
#   SPARK_WORKER_INSTANCES  The number of worker instances to run on this
#                           slave. Default is 1.
#   SPARK_WORKER_PORT       The base port number for the first worker. If set,
#                           subsequent workers will increment this number. If
#                           unset, Spark will find a valid port number, but
#                           with no guarantee of a predictable pattern.
#   SPARK_WORKER_WEBUI_PORT The base port for the web interface of the first
#                           worker. Subsequent workers will increment this
#                           number. Default is 8081.
if [ -z "${SPARK_HOME}" ]; then export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" fi
# NOTE: This exact class name is matched downstream by SparkSubmit.
# Any changes need to be reflected there.
CLASS="org.apache.spark.deploy.worker.Worker"
# First argument should be the master; we need to store it aside because we may
# need to insert arguments between it and the other arguments
MASTER=$1
shift
# Determine desired worker port
if [ "$SPARK_WORKER_WEBUI_PORT" = "" ]; then
  SPARK_WORKER_WEBUI_PORT=8081
fi
# Start up the appropriate number of workers on this machine.
# quick local function to start a worker
function start_instance {
  WORKER_NUM=$1
  shift
if [ "$SPARK_WORKER_INSTANCES" = "" ]; then start_instance 1 "$@" else for ((i=0; i<$SPARK_WORKER_INSTANCES; i++)); do start_instance $(( 1 + $i )) "$@" done fi
The Worker main method
def main(argStrings: Array[String]) {
  Utils.initDaemon(log)
  val conf = new SparkConf
  val args = new WorkerArguments(argStrings, conf)
  val rpcEnv = startRpcEnvAndEndpoint(args.host, args.port, args.webUiPort, args.cores,
    args.memory, args.masters, args.workDir, conf = conf)
  rpcEnv.awaitTermination()
}
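main itself only parses the arguments and then blocks in awaitTermination; the real work happens in startRpcEnvAndEndpoint, which creates an RpcEnv bound to the worker's host and port and registers the Worker endpoint on it. A condensed sketch of that shape (simplified, most parameters omitted; RpcEnv is private[spark], so this assumes code inside org.apache.spark):

import org.apache.spark.{SecurityManager, SparkConf}
import org.apache.spark.rpc.RpcEnv

// Condensed sketch, not the full source: build the worker's RpcEnv and
// register an endpoint on it, then hand the env back for awaitTermination().
def startWorkerRpcEnvSketch(host: String, port: Int, conf: SparkConf): RpcEnv = {
  val securityMgr = new SecurityManager(conf)
  val rpcEnv = RpcEnv.create("sparkWorker", host, port, conf, securityMgr)
  // The actual method registers the Worker endpoint here, roughly:
  //   rpcEnv.setupEndpoint("Worker", new Worker(rpcEnv, webUiPort, cores, memory, ...))
  rpcEnv
}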
Job submission
spark-submit
if [ -z "${SPARK_HOME}" ]; then source "$(dirname "$0")"/find-spark-home fi
# disable randomized hash for string in Python 3.3+
export PYTHONHASHSEED=0
if [ -z "${SPARK_HOME}" ]; then source "$(dirname "$0")"/find-spark-home fi
. "${SPARK_HOME}"/bin/load-spark-env.sh
# Find the java binary
if [ -n "${JAVA_HOME}" ]; then
  RUNNER="${JAVA_HOME}/bin/java"
else
  if [ "$(command -v java)" ]; then
    RUNNER="java"
  else
    echo "JAVA_HOME is not set" >&2
    exit 1
  fi
fi
# Find Spark jars.
if [ -d "${SPARK_HOME}/jars" ]; then
  SPARK_JARS_DIR="${SPARK_HOME}/jars"
else
  SPARK_JARS_DIR="${SPARK_HOME}/assembly/target/scala-$SPARK_SCALA_VERSION/jars"
fi
if [ ! -d "$SPARK_JARS_DIR" ] && [ -z "$SPARK_TESTING$SPARK_SQL_TESTING" ]; then echo "Failed to find Spark jars directory ($SPARK_JARS_DIR)." 1>&2 echo "You need to build Spark with the target \"package\" before running this program." 1>&2 exit 1 else LAUNCH_CLASSPATH="$SPARK_JARS_DIR/*" fi
# Add the launcher build dir to the classpath if requested.
if [ -n "$SPARK_PREPEND_CLASSES" ]; then
  LAUNCH_CLASSPATH="${SPARK_HOME}/launcher/target/scala-$SPARK_SCALA_VERSION/classes:$LAUNCH_CLASSPATH"
fi
# For tests
if [[ -n "$SPARK_TESTING" ]]; then
  unset YARN_CONF_DIR
  unset HADOOP_CONF_DIR
fi
# The launcher library will print arguments separated by a NULL character, to allow arguments with
# characters that would be otherwise interpreted by the shell. Read that in a while loop, populating
# an array that will be used to exec the final command.
#
# The exit code of the launcher is appended to the output, so the parent shell removes it from the
# command array and checks the value to see if the launcher succeeded.
build_command() {
  "$RUNNER" -Xmx128m -cp "$LAUNCH_CLASSPATH" org.apache.spark.launcher.Main "$@"
  printf "%d\0" $?
}
CMD=()
while IFS= read -d '' -r ARG; do
  CMD+=("$ARG")
done < <(build_command "$@")
# Certain JVM failures result in errors being printed to stdout (instead of stderr), which causes
# the code that parses the output of the launcher to get confused. In those cases, check if the
# exit code is an integer, and if it's not, handle it as a special error case.
if ! [[ $LAUNCHER_EXIT_CODE =~ ^[0-9]+$ ]]; then
  echo "${CMD[@]}" | head -n-1 1>&2
  exit 1
fi
if [ $LAUNCHER_EXIT_CODE != 0 ]; then
  exit $LAUNCHER_EXIT_CODE
fi
# SPARK-4161: scala does not assume use of the java classpath,
# so we need to add the "-Dscala.usejavacp=true" flag manually. We
# do this specifically for the Spark shell because the scala REPL
# has its own class loader, and any additional classpath specified
# through spark.driver.extraClassPath is not automatically propagated.
SPARK_SUBMIT_OPTS="$SPARK_SUBMIT_OPTS -Dscala.usejavacp=true"
function main() {
  if $cygwin; then
    # Workaround for issue involving JLine and Cygwin
    # (see http://sourceforge.net/p/jline/bugs/40/).
    # If you're using the Mintty terminal emulator in Cygwin, may need to set the
    # "Backspace sends ^H" setting in "Keys" section of the Mintty options
    # (see https://github.com/sbt/sbt/issues/562).
    stty -icanon min 1 -echo > /dev/null 2>&1
    export SPARK_SUBMIT_OPTS="$SPARK_SUBMIT_OPTS -Djline.terminal=unix"
    "${SPARK_HOME}"/bin/spark-submit --class org.apache.spark.repl.Main --name "Spark shell" "$@"
    stty icanon echo > /dev/null 2>&1
  else
    export SPARK_SUBMIT_OPTS
    "${SPARK_HOME}"/bin/spark-submit --class org.apache.spark.repl.Main --name "Spark shell" "$@"
  fi
}
# Copy restore-TTY-on-exit functions from Scala script so spark-shell exits properly even in
# binary distribution of Spark where Scala is not installed
exit_status=127
saved_stty=""
// Visible for testing
private[repl] def doMain(args: Array[String], _interp: SparkILoop): Unit = {
  interp = _interp
  val jars = Utils.getUserJars(conf, isShell = true).mkString(File.pathSeparator)
  val interpArguments = List(
    "-Yrepl-class-based",
    "-Yrepl-outdir", s"${outputDir.getAbsolutePath}",
    "-classpath", jars
  ) ++ args.toList
  val settings = new GenericRunnerSettings(scalaOptionError)
  settings.processArguments(interpArguments, true)
  if (!hasErrors) {
    interp.process(settings) // the REPL starts and enters its read-eval-print loop
    Option(sparkContext).map(_.stop)
  }
}
def createSparkSession(): SparkSession = {
  val execUri = System.getenv("SPARK_EXECUTOR_URI")
  conf.setIfMissing("spark.app.name", "Spark shell")
  // SparkContext will detect this configuration and register it with the RpcEnv's
  // file server, setting spark.repl.class.uri to the actual URI for executors to
  // use. This is sort of ugly but since executors are started as part of SparkContext
  // initialization in certain cases, there's an initialization order issue that prevents
  // this from being set after SparkContext is instantiated.
  conf.set("spark.repl.class.outputDir", outputDir.getAbsolutePath())
  if (execUri != null) {
    conf.set("spark.executor.uri", execUri)
  }
  if (System.getenv("SPARK_HOME") != null) {
    conf.setSparkHome(System.getenv("SPARK_HOME"))
  }
  val builder = SparkSession.builder.config(conf)
  if (conf.get(CATALOG_IMPLEMENTATION.key, "hive").toLowerCase == "hive") {
    if (SparkSession.hiveClassesArePresent) {
      // In the case that the property is not set at all, builder's config
      // does not have this value set to 'hive' yet. The original default
      // behavior is that when there are hive classes, we use hive catalog.
      sparkSession = builder.enableHiveSupport().getOrCreate()
      logInfo("Created Spark session with Hive support")
    } else {
      // Need to change it back to 'in-memory' if no hive classes are found
      // in the case that the property is set to hive in spark-defaults.conf
      builder.config(CATALOG_IMPLEMENTATION.key, "in-memory")
      sparkSession = builder.getOrCreate()
      logInfo("Created Spark session")
    }
  } else {
    // In the case that the property is set but not to 'hive', the internal
    // default is 'in-memory'. So the sparkSession will use in-memory catalog.
    sparkSession = builder.getOrCreate()
    logInfo("Created Spark session")
  }
  sparkContext = sparkSession.sparkContext
  sparkSession
}
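The decision above is driven entirely by spark.sql.catalogImplementation (the key behind CATALOG_IMPLEMENTATION). As a hypothetical standalone example, not part of Main.scala, an application can pin the in-memory catalog through the same key when building its own session:

import org.apache.spark.sql.SparkSession

// Hypothetical standalone example: request the in-memory catalog explicitly,
// regardless of whether Hive classes are on the classpath.
val spark = SparkSession.builder()
  .appName("catalog-example")
  .master("local[*]")
  .config("spark.sql.catalogImplementation", "in-memory")
  .getOrCreate()

println(spark.sparkContext.getConf.get("spark.sql.catalogImplementation"))
spark.stop()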
/** Print a welcome message */
def printWelcome() {
  echo("""Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version %s
      /_/
         """.format(SPARK_VERSION))
  import Properties._
  val welcomeMsg = "Using Scala %s (%s, Java %s)".format(
    versionString, javaVmName, javaVersion)
  echo(welcomeMsg)
  echo("Type in expressions to have them evaluated.")
  echo("Type :help for more information.")
}
protected def asyncMessage(msg: String) {
  if (isReplInfo || isReplPower)
    echoAndRefresh(msg)
}
private val initLock = new java.util.concurrent.locks.ReentrantLock()
private val initCompilerCondition = initLock.newCondition() // signal the compiler is initialized
private val initLoopCondition = initLock.newCondition()     // signal the whole repl is initialized
private val initStart = System.nanoTime
private def withLock[T](body: => T): T = {
  initLock.lock()
  try body
  finally initLock.unlock()
}
// a condition used to ensure serial access to the compiler.
@volatile private var initIsComplete = false
@volatile private var initError: String = null
private def elapsed() = "%.3f".format((System.nanoTime - initStart).toDouble / 1000000000L)
// the method to be called when the interpreter is initialized.
// Very important this method does nothing synchronous (i.e. do
// not try to use the interpreter) because until it returns, the
// repl's lazy val `global` is still locked.
protected def initializedCallback() = withLock(initCompilerCondition.signal())
// Spins off a thread which awaits a single message once the interpreter
// has been initialized.
protected def createAsyncListener() = {
  io.spawn {
    withLock(initCompilerCondition.await())
    asyncMessage("[info] compiler init time: " + elapsed() + " s.")
    postInitialization()
  }
}
// called from main repl loop
protected def awaitInitialized(): Boolean = {
  if (!initIsComplete)
    withLock { while (!initIsComplete) initLoopCondition.await() }
  if (initError != null) {
    // scalastyle:off println
    println("""
      |Failed to initialize the REPL due to an unexpected error.
      |This is a bug, please, report it along with the error diagnostics printed below.
      |%s.""".stripMargin.format(initError)
    )
    // scalastyle:on println
    false
  } else true
}
// private def warningsThunks = List(
//   () => intp.bind("lastWarnings", "" + typeTag[List[(Position, String)]], intp.lastWarnings _),
// )
protected def postInitThunks = List[Option[() => Unit]](
  Some(intp.setContextClassLoader _),
  if (isReplPower) Some(() => enablePowerMode(true)) else None
).flatten
// ++ (
//   warningsThunks
// )

// called once after init condition is signalled
protected def postInitialization() {
  try {
    postInitThunks foreach (f => addThunk(f()))
    runThunks()
  } catch {
    case ex: Throwable =>
      initError = stackTraceString(ex)
      throw ex
  } finally {
    initIsComplete = true
    if (isAsync) {
      asyncMessage("[info] total init time: " + elapsed() + " s.")
      withLock(initLoopCondition.signal())
    }
  }
}
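The plumbing above is the classic lock/condition hand-off: the initializer thread signals initLoopCondition exactly once, and awaitInitialized blocks on the same condition until then. The same idea in a minimal, self-contained form (independent of the REPL classes, for illustration only):

import java.util.concurrent.locks.ReentrantLock

// Minimal illustration of the signal/await pattern used above: one thread
// signals a condition when set-up finishes, another waits for it under the
// same lock, re-checking the flag to guard against spurious wake-ups.
object InitSignalDemo {
  private val lock = new ReentrantLock()
  private val ready = lock.newCondition()
  @volatile private var initialized = false

  def main(args: Array[String]): Unit = {
    val initializer = new Thread(new Runnable {
      def run(): Unit = {
        Thread.sleep(100) // pretend to do expensive initialization
        lock.lock()
        try { initialized = true; ready.signal() } finally lock.unlock()
      }
    })
    initializer.start()

    lock.lock()
    try { while (!initialized) ready.await() } finally lock.unlock()
    println("initialization complete")
  }
}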
def initializeSpark() {
  intp.beQuietDuring {
    command("""
      @transient val spark = org.apache.spark.repl.Main.interp.createSparkSession()
      @transient val sc = {
        val _sc = spark.sparkContext
        if (_sc.getConf.getBoolean("spark.ui.reverseProxy", false)) {
          val proxyUrl = _sc.getConf.get("spark.ui.reverseProxyUrl", null)
          if (proxyUrl != null) {
            println(s"Spark Context Web UI is available at ${proxyUrl}/proxy/${_sc.applicationId}")
          } else {
            println(s"Spark Context Web UI is available at Spark Master Public URL")
          }
        } else {
          _sc.uiWebUrl.foreach {
            webUrl => println(s"Spark context Web UI available at ${webUrl}")
          }
        }
        println("Spark context available as 'sc' " +
          s"(master = ${_sc.master}, app id = ${_sc.applicationId}).")
        println("Spark session available as 'spark'.")
        _sc
      }
      """)
    command("import org.apache.spark.SparkContext._")
    command("import spark.implicits._")
    command("import spark.sql")
    command("import org.apache.spark.sql.functions._")
  }
}
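Once those command(...) calls have run, every spark-shell session starts with spark, sc and the common imports already in scope, so a first interaction typically looks like this (illustrative shell input, not part of Main.scala):

// Typed at the scala> prompt; spark, sc and spark.implicits._ are already available.
val df = Seq(("alice", 1), ("bob", 2)).toDF("name", "count")
df.filter($"count" > 1).show()

sc.parallelize(1 to 10).sum()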
// code to be executed only after the interpreter is initialized
// and the lazy val `global` can be accessed without risk of deadlock.
private var pendingThunks: List[() => Unit] = Nil
protected def addThunk(body: => Unit) = synchronized {
  pendingThunks :+= (() => body)
}
protected def runThunks(): Unit = synchronized {
  if (pendingThunks.nonEmpty)
    logDebug("Clearing " + pendingThunks.size + " thunks.")
  while (pendingThunks.nonEmpty) {
    val thunk = pendingThunks.head
    pendingThunks = pendingThunks.tail
    thunk()
  }
}
}