Spark SQL usage examples
1. Creating a DataFrame
val rdd = sc.makeRDD(List((1,"zhang",19,"bj",1000000), (2,"wang",29,"sh",100), (3,"li",49,"sz",999)));
val df = rdd.toDF("id","name","age","addr","salary");
df.show()
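To double-check the column names and the types Spark inferred for the new DataFrame, the standard printSchema method can be called (a quick sanity check, not part of the original example):
df.printSchema // prints each column with its inferred type, e.g. id: integer, name: string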
(1) Query
df.select("id","name").show();
(2) Conditional query
df.select("id","name").where($"name" === "b").show()
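The same condition can also be written as a SQL expression string via filter, which avoids the $ column syntax (an equivalent form, shown here for comparison):
df.select("id","name").filter("name = 'b'").show()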
(3) Sorted query
orderBy("columnName") sorts in ascending order.
orderBy($"columnName".desc) sorts in descending order.
orderBy($"col1", $"col2".desc) sorts by two columns.
df.select("id","name").orderBy($"name".desc).show
df.select("id","name").sort($"name".desc).show
(4) Grouped query
groupBy("columnName", ...).max("columnName") computes the maximum.
groupBy("columnName", ...).min("columnName") computes the minimum.
groupBy("columnName", ...).avg("columnName") computes the average.
groupBy("columnName", ...).sum("columnName") computes the sum.
groupBy("columnName", ...).count() counts the rows in each group.
groupBy("columnName", ...).agg(...) combines several aggregations in one pass (see the sketch below).
val rdd = sc.makeRDD(List((1,"a","bj"),(2,"b","sh"),(3,"c","gz"),(4,"d","bj"),(5,"e","gz")));
val df = rdd.toDF("id","name","addr");
df.groupBy("addr").count().show()
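As a sketch of the agg variant mentioned above, several aggregates can be computed in one pass over the df just defined; the aggregate functions come from org.apache.spark.sql.functions:
import org.apache.spark.sql.functions.{count, max}
df.groupBy("addr").agg(count("name"), max("id")).show() // row count and max id per addr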
(5) Join query
val dept = sc.parallelize(List((100,"财务部"),(200,"研发部"))).toDF("deptid","deptname")
val emp = sc.parallelize(List((1,100,"张财务"),(2,100,"李会计"),(3,200,"王艳发"))).toDF("id","did","name")
dept.join(emp, $"deptid" === $"did").show
dept.join(emp, $"deptid" === $"did", "left").show
dept.join(emp, $"deptid" === $"did", "right").show
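To keep only the columns of interest after the join (mirroring the SQL join example later in this document), a select can be chained on:
dept.join(emp, $"deptid" === $"did").select("deptname", "name").show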
(6) Computation
val df = sc.makeRDD(List(1,2,3,4,5)).toDF("num");
df.select($"num" * 100).show
(7) Working with array columns
val df = sc.makeRDD(List(("zhang",Array("bj","sh")),("li",Array("sz","gz")))).toDF("name","addrs")
df.selectExpr("name","addrs[0]").show
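To work with every element of the array rather than a single index, the explode function from org.apache.spark.sql.functions produces one output row per array element:
import org.apache.spark.sql.functions.explode
df.select($"name", explode($"addrs")).show // one row per (name, addr) pair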
(8) Working with structs
{"name":"陈晨","address":{"city":"西安","street":"南二环甲字1号"}}
{"name":"娜娜","address":{"city":"西安","street":"南二环甲字2号"}}
val df = sqlContext.read.json("file:///root/work/users.json")
df.select("name","address.street").show
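printSchema is useful here to confirm how the nested JSON fields were mapped to a struct column:
df.printSchema // name: string, address: struct<city: string, street: string>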
(9) Miscellaneous
df.count // total number of records
val row = df.first() // the first record
val value = row.getString(1) // the value of the column at the given position in that row
df.collect // returns all rows of the DataFrame as an Array; under the hood this calls collect on the DataFrame's underlying RDD
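A Row also supports lookup by column name via getAs, which is less brittle than positional access:
val name = row.getAs[String]("name") // same value as row.getString(1) for this schema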
2. Operating on DataFrames directly with SQL
(0) Creating tables
Creating a temporary table - dropped when the session ends
df.registerTempTable("tabName")
In Spark 2.x (2.2.1 and later) registerTempTable is superseded by the following methods (a short sketch follows below):
createTempView / createOrReplaceTempView / createGlobalTempView / createOrReplaceGlobalTempView
Creating a persistent table - kept after the session ends
df.saveAsTable("tabName")
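A minimal sketch of the Spark 2.x equivalents, assuming the SparkSession named spark that spark-shell 2.x provides:
df.createOrReplaceTempView("tabName") // session-scoped view
spark.sql("select * from tabName").show()
df.createGlobalTempView("gtabName") // shared across sessions; qualified with the global_temp database
spark.sql("select * from global_temp.gtabName").show()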
(1) Query
val sqlContext = new org.apache.spark.sql.SQLContext(sc);
val df = sc.makeRDD(List((1,"a","bj"),(2,"b","sh"),(3,"c","gz"),(4,"d","bj"),(5,"e","gz"))).toDF("id","name","addr");
df.registerTempTable("stu");
sqlContext.sql("select * from stu").show()
(2) Conditional query
val df = sc.makeRDD(List((1,"a","bj"),(2,"b","sh"),(3,"c","gz"),(4,"d","bj"),(5,"e","gz"))).toDF("id","name","addr");
df.registerTempTable("stu");
sqlContext.sql("select * from stu where addr = 'bj'").show()
(3) Sorted query
val sqlContext = new org.apache.spark.sql.SQLContext(sc);
val df = sc.makeRDD(List((1,"a","bj"),(2,"b","sh"),(3,"c","gz"),(4,"d","bj"),(5,"e","gz"))).toDF("id","name","addr");
df.registerTempTable("stu");
sqlContext.sql("select * from stu order by addr").show()
(4) Grouped query
val sqlContext = new org.apache.spark.sql.SQLContext(sc);
val df = sc.makeRDD(List((1,"a","bj"),(2,"b","sh"),(3,"c","gz"),(4,"d","bj"),(5,"e","gz"))).toDF("id","name","addr");
df.registerTempTable("stu");
sqlContext.sql("select addr,count(*) from stu group by addr").show()
(5) Join query
val sqlContext = new org.apache.spark.sql.SQLContext(sc);
val dept=sc.parallelize(List((100,"财务部"),(200,"研发部"))).toDF("deptid","deptname")
val emp=sc.parallelize(List((1,100,"张财务"),(2,100,"李会计"),(3,200,"王艳发"))).toDF("id","did","name")
dept.registerTempTable("deptTab");
emp.registerTempTable("empTab");
sqlContext.sql("select deptname,name from deptTab inner join empTab on deptTab.deptid = empTab.did").show()
(6) Computation
val sqlContext = new org.apache.spark.sql.SQLContext(sc);
val df = sc.makeRDD(List(1,2,3,4,5)).toDF("num");
df.registerTempTable("tabx")
sqlContext.sql("select num * 100 from tabx").show();
(7) Paged query
val sqlContext = new org.apache.spark.sql.SQLContext(sc);
val df = sc.makeRDD(List(1,2,3,4,5)).toDF("num");
df.registerTempTable("tabx")
sqlContext.sql("select * from tabx limit 3").show();
(8) Listing tables
val sqlContext = new org.apache.spark.sql.SQLContext(sc);
val df1 = sc.makeRDD(List(1,2,3,4,5)).toDF("num");
df1.registerTempTable("tabx1")
val df2 = sc.makeRDD(List(1,2,3,4,5)).toDF("num");
df2.saveAsTable("tabx2")
sqlContext.sql("show tables").show
(9) Hive-style operations
val hiveContext = new org.apache.spark.sql.hive.HiveContext(sc)
hiveContext.sql("CREATE TABLE IF NOT EXISTS zzz (key INT, value STRING) row format delimited fields terminated by '|'")
hiveContext.sql("LOAD DATA LOCAL INPATH 'file:///root/work/hdata.txt' INTO TABLE zzz")
val df5 = hiveContext.sql("select key,value from zzz")
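The result is an ordinary DataFrame and can be inspected the same way as the earlier examples:
df5.show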