Part 1
The Spark version is 2.1.0.
1. The SparkConf class is used to set Spark's runtime parameters. Every setting, whether it was configured in a configuration file or specified in program code, goes through this class.
2. In the code, parameters arrive mainly through two paths: direct calls to the set methods, and reading from the configuration properties. See the inline comments for what each piece of code does.
3. The main job of this class is to wrap all of the configuration information in the settings field (a ConcurrentHashMap).
4. The code below is not the entire class, only the methods I have used.
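Before walking through the source, here is a minimal usage sketch of the public API these methods implement (the master URL and application name are illustrative):

import org.apache.spark.SparkConf

val conf = new SparkConf()          // loadDefaults = true: spark.* system properties are read in
  .setMaster("local[2]")            // equivalent to set("spark.master", "local[2]")
  .setAppName("ConfDemo")           // equivalent to set("spark.app.name", "ConfDemo")
println(conf.get("spark.master"))   // prints "local[2]"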
// The primary constructor's parameter is loadDefaults; when it is true, Spark reads in
// configuration from the spark.* JVM system properties
class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Serializable {

  import SparkConf._ // bring the companion object's members into scope

  // Auxiliary constructor: it calls the primary constructor with true, so by default
  // the program reads the properties configured on the system
  def this() = this(true)

  // A map that holds all of the settings; a plain Java concurrent collection is used,
  // because every property ends up in this map after it is loaded.
  // Properties loaded from the configuration first are overwritten by properties defined
  // later in program code, which is what produces the configuration precedence order.
  private val settings = new ConcurrentHashMap[String, String]()
  // Lazily builds a ConfigReader over the settings map and binds the environment
  // variables as an extra provider, so lookups can fall back to the environment
  @transient private lazy val reader: ConfigReader = {
    val _reader = new ConfigReader(new SparkConfigProvider(settings))
    _reader.bindEnv(new ConfigProvider {
      override def get(key: String): Option[String] = Option(getenv(key))
    })
    _reader
  }
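The idea is a lookup chain: check the settings map first, then fall back to environment variables. A standalone sketch of that chain (the name lookup is invented for illustration; this is not Spark's internal API):

import java.util.concurrent.ConcurrentHashMap

// Look the key up in the settings map; if it is absent, fall back to the environment.
def lookup(settings: ConcurrentHashMap[String, String], key: String): Option[String] =
  Option(settings.get(key)).orElse(Option(System.getenv(key)))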
  // Check the primary constructor's parameter; when the class was created through the
  // auxiliary constructor it is already true, so the local system properties are loaded
  if (loadDefaults) {
    loadFromSystemProperties(false)
  }

  private[spark] def loadFromSystemProperties(silent: Boolean): SparkConf = {
    // Load any spark.* system properties: fetch the system properties with the helper
    // in the util package, iterate with a guarded for-comprehension over the key/value
    // pairs whose key starts with "spark.", and put each pair into the map via set
    for ((key, value) <- Utils.getSystemProperties if key.startsWith("spark.")) {
      set(key, value, silent)
    }
    this
  }
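To see the load-then-override order in action, a small sketch (the property name and values are illustrative):

System.setProperty("spark.executor.memory", "1g") // picked up by loadFromSystemProperties
val conf = new SparkConf()                        // loadDefaults = true
assert(conf.get("spark.executor.memory") == "1g")
conf.set("spark.executor.memory", "2g")           // a programmatic set overwrites the loaded value
assert(conf.get("spark.executor.memory") == "2g")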
  // The public set method, used to set a parameter; it returns the SparkConf object
  // so that calls can be chained
  def set(key: String, value: String): SparkConf = {
    set(key, value, false)
  }

  // The core set method that returns the conf object; all of the others delegate to it.
  private[spark] def set(key: String, value: String, silent: Boolean): SparkConf = {
    if (key == null) {
      throw new NullPointerException("null key") // null keys are rejected
    }
    if (value == null) {
      throw new NullPointerException("null value for " + key)
    }
    if (!silent) {
      logDeprecationWarning(key) // logs a warning if the key is a deprecated configuration name
    }
    settings.put(key, value)
    this
  }
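A quick illustration of the null checks (the key name is a throwaway, and loadDefaults = false skips the system properties):

import scala.util.Try

val c = new SparkConf(false)
assert(Try(c.set(null, "v")).isFailure)               // null key   -> NullPointerException
assert(Try(c.set("spark.demo.key", null)).isFailure)  // null value -> NullPointerException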
  // Typed setters: convert the typed value to its string form and store it under the entry's key
  private[spark] def set[T](entry: ConfigEntry[T], value: T): SparkConf = {
    set(entry.key, entry.stringConverter(value))
    this
  }

  private[spark] def set[T](entry: OptionalConfigEntry[T], value: T): SparkConf = {
    set(entry.key, entry.rawStringConverter(value))
    this
  }
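ConfigEntry itself is internal to Spark, but the shape of the idea can be sketched standalone (DemoEntry and demoRetries are invented names):

// A tiny stand-in for the typed-entry pattern: each entry knows its key and how to
// render a typed value as the string that actually gets stored.
case class DemoEntry[T](key: String, stringConverter: T => String)

val demoRetries = DemoEntry[Int]("spark.demo.maxRetries", _.toString)
// set(entry, value) reduces to a plain string put:
val typedConf = new SparkConf(false).set(demoRetries.key, demoRetries.stringConverter(3))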
  /**
   * The master URL to connect to, such as "local" to run locally with one thread, "local[4]" to
   * run locally with 4 cores, or "spark://master:7077" to run on a Spark standalone cluster.
   */
  def setMaster(master: String): SparkConf = {
    set("spark.master", master) // again just delegates to the set overload above
  }
  /** Set a name for your application. Shown in the Spark web UI. */
  def setAppName(name: String): SparkConf = {
    set("spark.app.name", name)
  }

  /** Set JAR files to distribute to the cluster. */
  def setJars(jars: Seq[String]): SparkConf = {
    for (jar <- jars if (jar == null)) logWarning("null jar passed to SparkContext constructor")
    set("spark.jars", jars.filter(_ != null).mkString(","))
  }

  /** Set JAR files to distribute to the cluster. (Java-friendly version.) */
  def setJars(jars: Array[String]): SparkConf = {
    setJars(jars.toSeq)
  }
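Note the null handling in setJars: each null entry triggers a warning but is dropped from the stored value. For example (the jar paths are illustrative):

val jarConf = new SparkConf(false).setJars(Seq("/libs/a.jar", null, "/libs/b.jar"))
// a warning is logged for the null entry, and spark.jars becomes "/libs/a.jar,/libs/b.jar"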
  /**
   * Set an environment variable to be used when launching executors for this application.
   * These variables are stored as properties of the form spark.executorEnv.VAR_NAME
   * (for example spark.executorEnv.PATH) but this method makes them easier to set.
   */
  def setExecutorEnv(variable: String, value: String): SparkConf = {
    set("spark.executorEnv." + variable, value)
  }

  /**
   * Set multiple environment variables to be used when launching executors.
   * These variables are stored as properties of the form spark.executorEnv.VAR_NAME
   * (for example spark.executorEnv.PATH) but this method makes them easier to set.
   */
  def setExecutorEnv(variables: Seq[(String, String)]): SparkConf = {
    for ((k, v) <- variables) {
      setExecutorEnv(k, v)
    }
    this
  }
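Each pair simply becomes a prefixed key in the same settings map; for example (the variable names and paths are illustrative):

val envConf = new SparkConf(false)
  .setExecutorEnv(Seq("JAVA_HOME" -> "/opt/jdk", "PATH" -> "/opt/bin"))
// stored under spark.executorEnv.JAVA_HOME and spark.executorEnv.PATH
assert(envConf.get("spark.executorEnv.JAVA_HOME") == "/opt/jdk")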
  /**
   * Use Kryo serialization and register the given set of classes with Kryo.
   * If called multiple times, this will append the classes from all calls together.
   * This registers the given classes for Kryo serialization through SparkConf.
   */
  def registerKryoClasses(classes: Array[Class[_]]): SparkConf = {
    val allClassNames = new LinkedHashSet[String]() // an ordered set to collect the class names
    // Fetch the class names already registered under spark.kryo.classesToRegister from
    // settings (defaulting to "" if the key is absent) and add them to the set
    allClassNames ++= get("spark.kryo.classesToRegister", "").split(',').map(_.trim)
      .filter(!_.isEmpty)
    allClassNames ++= classes.map(_.getName) // add the newly passed-in class names
    set("spark.kryo.classesToRegister", allClassNames.mkString(",")) // write the merged list back into settings
    set("spark.serializer", classOf[KryoSerializer].getName) // switch the serializer to Kryo
    this
  }
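A usage sketch, showing that repeated calls append rather than replace (Point is an example class defined only for illustration):

class Point(val x: Int, val y: Int)

val kryoConf = new SparkConf(false)
  .registerKryoClasses(Array(classOf[Point]))
  .registerKryoClasses(Array(classOf[Array[Point]]))
// spark.serializer is now org.apache.spark.serializer.KryoSerializer, and
// spark.kryo.classesToRegister holds both class names, comma-separated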
  // More code follows, but most of it is similar: it takes configuration values and
  // stores them in the settings map