Big Data: Spark Core (4) Using the LogQuery Example to Explain How Executors Run RDD Operators
1. How does it actually run?
Plenty of blog posts explain at length what RDD, Dependency, Shuffle, and so on are, but how do the Executors actually run the code you submit?
Below is a log-analysis example taken from Spark's examples:
def main(args: Array[String]) {
  val sparkConf = new SparkConf().setAppName("Log Query")
  val sc = new SparkContext(sparkConf)

  val dataSet =
    if (args.length == 1) sc.textFile(args(0)) else sc.parallelize(exampleApacheLogs)
  // scalastyle:off
  val apacheLogRegex =
    """^([\d.]+) (\S+) (\S+) \[([\w\d:/]+\s[+\-]\d{4})\] "(.+?)" (\d{3}) ([\d\-]+) "([^"]+)" "([^"]+)".*""".r
  // scalastyle:on
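  // For reference, a log line in the Apache "combined" format that this regex
  // is written to match looks like the following (hypothetical sample values):
  //   10.10.10.10 - "FRED" [18/Jan/2013:17:56:07 +1100] "GET /index.html HTTP/1.1" 200 315 "http://example.com/" "Mozilla/4.0"
  // The capture groups are, in order: client IP, ident, user, timestamp,
  // request line, HTTP status, response bytes, referer, and user agent.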
  /** Tracks the total query count and number of aggregate bytes for a particular group. */
  class Stats(val count: Int, val numBytes: Int) extends Serializable {
    def merge(other: Stats): Stats = {
      new Stats(count + other.count, numBytes + other.numBytes)
    }
    override def toString: String = "bytes=%s\tn=%s".format(numBytes, count)
  }
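  // Note that Stats extends Serializable: its instances are created and merged
  // by tasks running on remote Executors and travel across the network
  // (through the shuffle, and back to the driver when results are collected),
  // so they must be serializable.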
  def extractKey(line: String): (String, String, String) = {
    apacheLogRegex.findFirstIn(line) match {
      case Some(apacheLogRegex(ip, _, user, dateTime, query, status, bytes, referer, ua)) =>
        if (user != "\"-\"") (ip, user, query)
        else (null, null, null)
      case _ => (null, null, null)
    }
  }
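  // extractKey runs on the Executors, once per line of the partition a task
  // is processing; the (ip, user, query) triple it returns becomes the key
  // that the reduceByKey later in this example groups on.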
  def extractStats(line: String): Stats = {
    apacheLogRegex.findFirstIn(line) match {
      case Some(apacheLogRegex(ip, _, user, dateTime, query, status, bytes, referer, ua)) =>
        new Stats(1, bytes.toInt)
      case _ => new Stats(1, 0)