Custom partitioner
package com.example.demoscala.controller
import org.apache.spark.{Partitioner, SparkConf, SparkContext}
object Test2 {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local")
      .setAppName("test")
    val sparkContext = new SparkContext(sparkConf)
    val rdd = sparkContext.makeRDD(List(("aa", "1111111"), ("bb", "2222"), ("cc", "33333"), ("aa", "444444444")))
    // Repartition with the custom partitioner
    val aa = rdd.partitionBy(new MyPartitioner())
    // Output directory
    aa.saveAsTextFile("wenjian2")
    sparkContext.stop()
  }
  // Custom partitioner
  class MyPartitioner extends Partitioner {
    // Use 3 partitions
    override def numPartitions: Int = 3
    // Assign a partition index (starting at 0) based on the key
    override def getPartition(key: Any): Int = {
      if (key == "aa") {
        0
      } else if (key == "bb") {
        1
      } else {
        2
      }
    }
  }
}
This yields 3 partitions: one for aa, one for bb, and one for cc.
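To check which partition each record landed in without opening the output files, mapPartitionsWithIndex can be used. This is a minimal sketch (not part of the original notes) that could be dropped into the main method above, before saveAsTextFile:
// Tag every record with the index of the partition it lives in
val withIndex = aa.mapPartitionsWithIndex { (index, iter) =>
  iter.map(kv => (index, kv))
}
// Expected: index 0 carries the "aa" records, 1 the "bb" record, 2 the "cc" record
withIndex.collect().foreach(println)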
Spark SQL
Add the dependency to pom.xml:
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.12</artifactId>
    <version>3.0.0</version>
</dependency>
bb.json
Note: the content must sit on a single line. By default Spark SQL reads JSON line by line (one record, or one array, per line), so a pretty-printed multi-line file fails to parse unless the multiLine option is enabled.
[{"name":"张三","age":12},{"name":"李四","age":13},{"name":"王五","age":14}]
DataFrame
package com.example.demoscala.controller
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}
object Test2 {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local").setAppName("test")
    // Create the SparkSession
    val sp = SparkSession.builder().config(sparkConf).getOrCreate()
    // Read the JSON file
    val df: DataFrame = sp.read.json("wenjian/bb.json")
    // Show the contents of the JSON file
    df.show()
    // Close the SparkSession
    sp.close()
  }
}
df.show() prints the rows of the table, much like the result of a SELECT query.
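As noted above, the default JSON reader expects each record on a single line. If the file were pretty-printed across several lines, the read would need the multiLine option; a sketch (the file name bb_pretty.json is made up for illustration):
// Read a multi-line (pretty-printed) JSON file
val df2 = sp.read
  .option("multiLine", "true")
  .json("wenjian/bb_pretty.json")
df2.show()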
SQL syntax
package com.example.demoscala.controller
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}
object Test2 {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local").setAppName("test")
    // Create the SparkSession
    val sp = SparkSession.builder().config(sparkConf).getOrCreate()
    // Read the JSON file
    val df: DataFrame = sp.read.json("wenjian/bb.json")
    // Register a temporary view
    df.createOrReplaceTempView("user")
    // Query the temporary view
    val aa = sp.sql("select * from user")
    // Show the result
    aa.show()
    // Register a global temporary view, which can be shared across sessions
    df.createGlobalTempView("qj")
    // A global temporary view must be queried as global_temp.<view name>
    val bb = sp.sql("select * from global_temp.qj")
    // Show the result
    bb.show()
    // Close the SparkSession
    sp.close()
  }
}
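Since the global temporary view is described as cross-session, a small sketch (not in the original notes) can demonstrate the difference. It would go inside the main method above, before sp.close():
// A new session can still see the global temp view...
val sp2 = sp.newSession()
sp2.sql("select * from global_temp.qj").show()
// ...but not the session-scoped temp view; the next line would fail with "Table or view not found"
// sp2.sql("select * from user").show()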
DSL syntax
package com.example.demoscala.controller
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}
object Test2 {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local").setAppName("test")
    // Create the SparkSession
    val sp = SparkSession.builder().config(sparkConf).getOrCreate()
    // Read the JSON file
    val df: DataFrame = sp.read.json("wenjian/bb.json")
    // Print the schema
    df.printSchema()
    // Show the selected columns
    df.select("age", "name").show()
    // Column expressions need the $ syntax, which requires importing sp.implicits._ first
    // (sp is the SparkSession created above)
    import sp.implicits._
    // Age plus 1
    df.select($"age" + 1).show()
    // Whether age is greater than 13 (a boolean column; see the filter sketch after this block)
    df.select($"age" > 13).show()
    // Group by age and count the rows in each group
    df.groupBy("age").count().show()
    // Close the SparkSession
    sp.close()
  }
}
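Note that select($"age" > 13) produces a column of true/false values rather than the matching rows. To keep only the rows where the condition holds, use filter (or its alias where); a minimal sketch to add inside the main method above:
// Keep only the rows whose age is greater than 13
df.filter($"age" > 13).show()
// Equivalent:
df.where($"age" > 13).show()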
Type conversions
Create a case class:
case class User(id:Int,name:String,age:Int)
package com.example.demoscala.controller
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
object Test2 {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local").setAppName("test")
    // Create the SparkSession
    val sp = SparkSession.builder().config(sparkConf).getOrCreate()
    // The conversions below also require this import
    import sp.implicits._
    // Create an RDD
    val rdd: RDD[(Int, String, Int)] = sp.sparkContext.makeRDD(List((1, "张三", 100), (2, "李四", 200), (3, "王五", 300)))
    // RDD -> DataFrame
    val df: DataFrame = rdd.toDF("id", "name", "age")
    df.show()
    // DataFrame -> Dataset
    val ds: Dataset[User] = df.as[User]
    ds.show()
    // Dataset -> DataFrame
    val df2 = ds.toDF()
    df2.show()
    // DataFrame -> RDD (of Row)
    val rdd2: RDD[Row] = df2.rdd
    rdd2.foreach(println)
    // RDD -> Dataset
    val ds2 = rdd.map {
      case (id, name, age) => User(id, name, age)
    }.toDS()
    ds2.show()
    // Dataset -> RDD
    val rdd3 = ds2.rdd
    rdd3.foreach(println)
    // Close the SparkSession
    sp.close()
  }
}
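Once back at RDD[Row], field values are read out of each Row by name or position. A minimal sketch (to drop into the main method above), assuming the id/name/age columns from this example:
// Read individual fields from a Row
val names = rdd2.map(row => row.getAs[String]("name"))
names.foreach(println)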
User-defined functions
package com.example.demoscala.controller
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
object Test2 {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local").setAppName("test")
    // Create the SparkSession
    val sp = SparkSession.builder().config(sparkConf).getOrCreate()
    // Read the JSON file
    val df = sp.read.json("wenjian/bb.json")
    // Register a temporary view
    df.createOrReplaceTempView("user")
    // Register a function that adds a prefix
    sp.udf.register("aa", (name: String) => {
      "前缀:" + name
    })
    // Use the custom aa function to prefix the name column
    sp.sql("select age,aa(name) from user").show()
    // Close the SparkSession
    sp.close()
  }
}
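The same function can also be used from the DSL side through org.apache.spark.sql.functions.udf, without registering it by name. A sketch added here for illustration (not from the original notes), to place inside the main method above:
// Wrap the prefix function as a DSL-usable UDF
import org.apache.spark.sql.functions.udf
import sp.implicits._
val addPrefix = udf((name: String) => "前缀:" + name)
df.select($"age", addPrefix($"name")).show()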
Working with MySQL
CREATE TABLE `user` (
`id` int NOT NULL AUTO_INCREMENT,
`name` varchar(255) DEFAULT NULL,
`updatetime` datetime DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=5 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
Add the dependency to pom.xml:
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>8.0.31</version>
</dependency>
package com.example.demoscala.controller
import org.apache.spark.SparkConf
import org.apache.spark.sql.{SaveMode, SparkSession}
object Test2 {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local").setAppName("test")
    // Create the SparkSession
    val sp = SparkSession.builder().config(sparkConf).getOrCreate()
    // Show the user table from MySQL (the table must already exist)
    sp.read.format("jdbc")
      .option("url", "jdbc:mysql://localhost:3306/dmg1")
      .option("driver", "com.mysql.cj.jdbc.Driver") // driver class for mysql-connector-java 8.x
      .option("user", "root")
      .option("password", "123456")
      .option("dbtable", "user")
      .load().show()
    import sp.implicits._
    // Create a Dataset of User records
    val rdd = sp.sparkContext.makeRDD(List(User(1, "张三", 100), User(2, "李四", 200), User(3, "王五", 300)))
    val ds = rdd.toDS()
    // Write the Dataset to a user1 table
    ds.write
      .format("jdbc")
      .option("url", "jdbc:mysql://localhost:3306/dmg1")
      .option("driver", "com.mysql.cj.jdbc.Driver")
      .option("user", "root")
      .option("password", "123456")
      .option("dbtable", "user1")
      .mode(SaveMode.Append)
      .save()
    // Close the SparkSession
    sp.close()
  }
}
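Instead of loading the whole table with dbtable, a query can be pushed down to MySQL via the query option (available since Spark 2.4). A sketch reusing the connection settings above:
// Load only part of the user table by pushing a query down to MySQL
sp.read.format("jdbc")
  .option("url", "jdbc:mysql://localhost:3306/dmg1")
  .option("driver", "com.mysql.cj.jdbc.Driver")
  .option("user", "root")
  .option("password", "123456")
  .option("query", "select id, name from user where id > 1")
  .load()
  .show()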