The case class used by the tests:
case class Player(name: String, age: Int, gender: String)
Add the relevant dependencies to the pom:
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.3.2-mdh1.0.0-SNAPSHOT</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>2.3.2-mdh1.0.0-SNAPSHOT</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-hive_2.11</artifactId>
    <version>2.3.2-mdh1.0.0-SNAPSHOT</version>
</dependency>
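The MySQL example in section 4 additionally needs a JDBC driver on the classpath. A typical entry would look like the following (the version shown is an assumption; pick one that matches your MySQL server):
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.47</version>
</dependency>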
1. Build an RDD by hand and convert back and forth between RDD, DataFrame, and Dataset
// imports shared by all of the methods below
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

def method1(): Unit = {
  // master is set to local[*] for local runs; on a cluster it should be yarn
  val spark: SparkSession = SparkSession.builder().appName("test1").master("local[*]").getOrCreate()
  val sparkContext: SparkContext = spark.sparkContext
  // Build an RDD by hand: each List element is a row, each tuple field is a column.
  // makeRDD simply delegates to parallelize, which turns a local collection into an RDD.
  val rdd1: RDD[(String, Int, String)] = sparkContext.makeRDD(List(("jack", 11, "male"), ("lisa", 12, "female")))
  rdd1.foreach(println)
  // import the implicit conversions that provide toDF/toDS/as
  import spark.implicits._
  // RDD to DataFrame; column names can be passed to toDF
  val df1: DataFrame = rdd1.toDF("name", "age", "gender")
  df1.show()
  // map the RDD's tuples onto the case class, then convert to a Dataset
  val ds1: Dataset[Player] = rdd1.map(x => Player(x._1, x._2, x._3)).toDS()
  ds1.show()
  // convert the DataFrame and the Dataset back to RDDs and print them row by row
  df1.rdd.foreach(println)
  ds1.rdd.foreach(println)
  // DataFrame to Dataset; the column names and types must match the case class
  val ds2: Dataset[Player] = df1.as[Player]
  ds2.show()
  // Dataset to DataFrame, assigning new column names via toDF again
  val df2: DataFrame = ds1.toDF("nm", "ag", "sex")
  df2.show()
  spark.stop()
}
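Besides the implicit toDF/toDS conversions, a DataFrame can also be built from an RDD of Rows with an explicitly declared schema via createDataFrame. A minimal sketch using the same Player columns (method1b is an illustrative name):
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

def method1b(): Unit = {
  val spark: SparkSession = SparkSession.builder().appName("test1b").master("local[*]").getOrCreate()
  // declare the column names and types by hand instead of relying on implicits
  val schema = StructType(Seq(
    StructField("name", StringType, nullable = true),
    StructField("age", IntegerType, nullable = true),
    StructField("gender", StringType, nullable = true)))
  // each Row must line up with the schema, field by field
  val rowRdd: RDD[Row] = spark.sparkContext.makeRDD(List(Row("jack", 11, "male"), Row("lisa", 12, "female")))
  val df: DataFrame = spark.createDataFrame(rowRdd, schema)
  df.show()
  spark.stop()
}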
2. Read a local file into an RDD and implement WordCount
def method2(): Unit = {
  val sparkConf: SparkConf = new SparkConf().setAppName("test2").setMaster("local[*]")
  val sparkContext: SparkContext = new SparkContext(sparkConf)
  // textFile is the counterpart of parallelize: it creates an RDD from an external file, one entry per line
  val rddFile: RDD[String] = sparkContext.textFile("src/main/resources/myfile/goodjob.txt")
  // split each line on spaces, flatMap the arrays into a flat RDD of words, map each word to a
  // (word, 1) tuple, then reduceByKey: tuples with the same _1 have their _2 values summed
  val rddResult: RDD[(String, Int)] = rddFile.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
  rddResult.foreach(println)
  // saveAsTextFile writes the result as a directory of part files (and fails if the path already exists)
  rddResult.saveAsTextFile("src/main/resources/myfile/result.txt")
  sparkContext.stop()
}
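The same WordCount can also be written against the Dataset API, where groupBy plus count replaces the manual (word, 1) tuples and reduceByKey. A sketch assuming the same input file (method2b is an illustrative name):
def method2b(): Unit = {
  val spark: SparkSession = SparkSession.builder().appName("test2b").master("local[*]").getOrCreate()
  import spark.implicits._
  // spark.read.textFile returns a Dataset[String], one entry per line, just like textFile
  val lines: Dataset[String] = spark.read.textFile("src/main/resources/myfile/goodjob.txt")
  // flatMap to words (single column named "value"), then aggregate with groupBy + count
  val counts: DataFrame = lines.flatMap(_.split(" ")).groupBy("value").count()
  counts.orderBy($"count".desc).show()
  spark.stop()
}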
3. Working with Hive from Spark
def method3(): Unit = {
  // note: enableHiveSupport is required when Spark works with Hive
  val sparkSession: SparkSession = SparkSession.builder().appName("test3").master("local[*]").enableHiveSupport().getOrCreate()
  // first create the Hive tables player and player2 in the meta database
  // (spark.sql executes a single statement and rejects a trailing semicolon)
  sparkSession.sql("create table if not exists meta.player(name string, age int, gender string)")
  sparkSession.sql("create table if not exists meta.player2(name string, age int, gender string)")
  // insert rows into player
  sparkSession.sql("insert into table meta.player values('wangming',11,'male')")
  sparkSession.sql("insert into table meta.player values('yuki',12,'female')")
  sparkSession.sql("insert into table meta.player values('lili',13,'female')")
  // deliberately rename the columns to test the round trip
  val df: DataFrame = sparkSession.sql("select * from meta.player where age < 12").toDF("a", "b", "c")
  // createOrReplaceTempView registers the DataFrame as a temporary view in this SparkSession,
  // replacing any existing view of the same name; subsequent SQL runs against this view
  df.createOrReplaceTempView("tv1")
  sparkSession.sql("insert into meta.player2 select a, b, c from tv1")
  sparkSession.stop()
}
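Instead of going through a temp view and a SQL INSERT, the DataFrameWriter can also write straight to a Hive table. A minimal sketch (meta.player3 is an assumed target table name):
def method3b(): Unit = {
  val sparkSession: SparkSession = SparkSession.builder().appName("test3b").master("local[*]").enableHiveSupport().getOrCreate()
  val df: DataFrame = sparkSession.sql("select * from meta.player where age < 12")
  // saveAsTable writes through the Hive metastore; "append" keeps existing rows,
  // "overwrite" would replace the table contents
  df.write.mode("append").saveAsTable("meta.player3")
  sparkSession.stop()
}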
4. Working with MySQL from Spark
def method4(): Unit = {
  // plain JDBC access needs no Hive support
  val sparkSession: SparkSession = SparkSession.builder().appName("test4").master("local[*]").getOrCreate()
  // read the ui table from MySQL over JDBC into a DataFrame
  val df1: DataFrame = sparkSession.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/my_test").option("user", "root").option("password", "123456").option("dbtable", "ui").load()
  df1.createOrReplaceTempView("tv1")
  val df2: DataFrame = sparkSession.sql("select * from tv1 where age < 20")
  // write the filtered rows back to MySQL; mode("append") adds rows to ui2 instead of overwriting it
  df2.write.mode("append").format("jdbc").option("url", "jdbc:mysql://localhost:3306/my_test?useUnicode=true&characterEncoding=utf8&useSSL=false&zeroDateTimeBehavior=convertToNull&jdbcCompliantTruncation=false").option("user", "root").option("password", "123456").option("dbtable", "ui2").save()
  sparkSession.stop()
}
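By default the JDBC read above goes through a single connection. For larger tables the scan can be parallelized with the standard JDBC partitioning options. A sketch, assuming a SparkSession named sparkSession as above and a numeric id column in ui (the column name and bounds are assumptions):
val dfParallel: DataFrame = sparkSession.read.format("jdbc")
  .option("url", "jdbc:mysql://localhost:3306/my_test")
  .option("user", "root")
  .option("password", "123456")
  .option("dbtable", "ui")
  // split the scan into 4 concurrent queries over ranges of the id column
  .option("partitionColumn", "id")
  .option("lowerBound", "1")
  .option("upperBound", "10000")
  .option("numPartitions", "4")
  .load()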