1.1 Demo
Let's start with a Spark SQL example:
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

public class TestSparkSql {
    public static void main(String[] args) {
        Logger log = Logger.getLogger(TestSparkSql.class);
        // Force the JDK's built-in Xerces implementations for XML parsing.
        System.setProperty("javax.xml.parsers.DocumentBuilderFactory",
                "com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl");
        System.setProperty("javax.xml.parsers.SAXParserFactory",
                "com.sun.org.apache.xerces.internal.jaxp.SAXParserFactoryImpl");
        // Configure is a project-specific configuration helper (not part of Spark).
        String sparkMaster = Configure.instance.get("sparkMaster");
        String sparkJarAddress = Configure.instance.get("sparkJarAddress");
        String sparkExecutorMemory = Configure.instance.get("sparkExecutorMemory");
        String sparkCoresMax = Configure.instance.get("sparkCoresMax");
        String sparkLocalDir = Configure.instance.get("sparkLocalDir");
        log.info("initialize parameters");
        log.info("sparkMaster:" + sparkMaster);
        log.info("sparkJarAddress:" + sparkJarAddress);
        log.info("sparkExecutorMemory:" + sparkExecutorMemory);
        log.info("sparkCoresMax:" + sparkCoresMax);
        log.info("sparkLocalDir:" + sparkLocalDir);
        SparkConf sparkConf = new SparkConf().setAppName("dse load application in Java");
        sparkConf.setMaster(sparkMaster);
        if (!sparkJarAddress.isEmpty() && !sparkMaster.contains("local")) {
            sparkConf.set("spark.executor.memory", sparkExecutorMemory); // e.g. 16g
            sparkConf.set("spark.scheduler.mode", "FAIR");
            sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
            sparkConf.set("spark.kryo.registrator", "com.dahua.dse3.driver.dataset.DseKryoRegistrator");
            sparkConf.set("spark.cores.max", sparkCoresMax);
            sparkConf.set("spark.akka.threads", "12");
            sparkConf.set("spark.local.dir", sparkLocalDir);
            sparkConf.set("spark.shuffle.manager", "SORT");
            sparkConf.set("spark.network.timeout", "120");
            sparkConf.set("spark.rpc.lookupTimeout", "120");
            sparkConf.set("spark.executor.extraClassPath", "/usr/dahua/spark/executelib/hbase-protocol-0.98.3-hadoop2.jar");
            sparkConf.set("spark.executor.extraJavaOptions", "-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps");
            sparkConf.set("spark.sql.codegen", "TRUE");
            //sparkConf.set("spark.sql.parquet.filterPushdown", "true");
        }
        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
        if (!sparkJarAddress.isEmpty() && !sparkMaster.contains("local")) {
            jsc.addJar(sparkJarAddress);
        }
        String hdfsPath = "hdfs://mycluster/wl/parquet/test/2016-06-21";
        String source = "test";
        SQLContext sqlContext = new SQLContext(jsc);
        // Load the Parquet files as a DataFrame and register them as a temporary table.
        DataFrame dataFrame = sqlContext.parquetFile(hdfsPath);
        dataFrame.registerTempTable(source);
        String sql = "SELECT id,dev_chnid,dev_chnname,car_num,car_speed,car_direct from test";
        DataFrame result = sqlContext.sql(sql);
        // count() is an action: only here is a job actually submitted and executed.
        log.info("Result:" + result.count());
    }
}
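For comparison, the same projection can also be expressed through the Spark 1.x DataFrame API instead of an SQL string. The following lines are only an illustrative sketch, not part of the original program; they reuse the dataFrame and log variables defined in the demo above:

        // Equivalent projection via the DataFrame API (illustrative sketch,
        // reusing dataFrame and log from the demo above).
        DataFrame projected = dataFrame.select("id", "dev_chnid", "dev_chnname",
                "car_num", "car_speed", "car_direct");
        log.info("Result:" + projected.count());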
Executing result.count() triggers the client to submit a job for computation. Let's first look at the key log output (the Spark source was modified slightly to make these log statements easier to follow):
16-07-08 17:19:46,080 INFO org.apache.spark.sql.SQLContext(Logging.scala:59) ## ----------------------parseSql start--------------------------
16-07-08 17:19:46,080 INFO org.apache.spark.sql.SQLContext(Logging.scala:59) ##
[SELECT id,dev_chnid,dev_chnname,car_num,car_speed,car_direct from test]
16-07-08 17:19:46,728 INFO org.apache.spark.sql.SQLContext(Logging.scala:59) ## ----------------------parseSql end --------------------------
16-07-08 17:19:46,738 INFO org.apache.spark.sql.SQLContext(Logging.scala:59) ##
['Project ['id,'dev_chnid,'dev_chnname,'car_num,'car_speed,'car_direct]
'UnresolvedRelation [test], None
]
……
16-07-08 17:29:28,651 INFO org.apache.spark.scheduler.TaskSchedulerImpl(Logging.scala:59) ## Removed TaskSet 1.0, whose tasks have all completed, from pool default
16-07-08 17:29:28,661 INFO org.apache.spark.scheduler.DAGScheduler(Logging.scala:59) ## Job 0 finished: count at TestSparkSql.java:64, took 11.098610 s
[== Parsed Logical Plan ==
Aggregate [COUNT(1) AS count#43L]
Project [id#0L,dev_chnid#26,dev_chnname#4,car_num#5,car_speed#8,car_direct#12]
Subquery test
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code
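The plans shown in these logs can also be printed without touching Spark's source: in the Spark 1.x DataFrame API, explain(true) dumps the parsed, analyzed and optimized logical plans together with the physical plan to standard output. A minimal sketch, continuing from the demo above:

        // Continuing from the demo: print the parsed, analyzed and optimized logical
        // plans plus the physical plan of the query (DataFrame.explain(extended)).
        result.explain(true);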