Source: http://bbs.easysoo.cn/?p=463
This section explains how Spark loads files of different formats, covering both local mode and cluster mode.
1. Loading a txt file
/**
 * txt format: words separated by spaces
 */
import org.apache.spark.{SparkContext, SparkConf}

object WordCount {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("txtwd")
    val path = "file:///usr/local/spark/sbin/p.txt" // local mode
    //val path = "hdfs://usr/spark/p.txt"           // cluster mode
    val sc = new SparkContext(conf)
    sc.textFile(path)
      .flatMap(_.split(" "))
      .map(x => (x, 1))
      .reduceByKey(_ + _)
      .take(10)
      .foreach(println)
    sc.stop()
  }
}
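For reference (the file contents below are made up for illustration), a p.txt of space-separated words might look like:

hello spark
hello world

Packaged into a jar and submitted with spark-submit --class WordCount, the job prints up to ten (word, count) pairs, e.g. (hello,2), (spark,1), (world,1).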
2. Loading a json file
/**
 * json format: one JSON object per line, e.g. {"name":"xxx","age":12}
 */
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.sql.SQLContext

object TestSql {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("jsonsql")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    val path = "file:///usr/local/spark/sbin/p.json" // local mode
    //val path = "hdfs://usr/spark/p.json"           // cluster mode
    // jsonFile is deprecated since Spark 1.4; sqlContext.read.json(path) is the newer equivalent
    val people = sqlContext.jsonFile(path)
    people.printSchema()
    people.registerTempTable("people")
    sqlContext.sql("select name, age from people where age > 10")
      .collect().foreach(println)
    sc.stop()
  }
}
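Note that jsonFile (like the newer read.json) expects one complete JSON object per line, not a single pretty-printed document. A hypothetical p.json might be:

{"name":"xxx","age":12}
{"name":"yyy","age":25}

printSchema() then shows the inferred schema (here age: long and name: string), and the query prints the rows whose age exceeds 10, e.g. [xxx,12] and [yyy,25].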