Spark SQL Story
Write less code
spark.table("").groupBy().....
From the angle of a word-count (wc) example:
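A minimal word-count sketch (the file name input.txt is an assumption), contrasting the RDD API with the DataFrame API; the DataFrame version needs less code and gives the Catalyst optimizer something to work with:

import org.apache.spark.sql.functions._
import spark.implicits._

// RDD version: every step written out by hand
sc.textFile("input.txt")
  .flatMap(_.split(" "))
  .map((_, 1))
  .reduceByKey(_ + _)
  .collect()

// DataFrame version: declarative, optimizer-friendly
spark.read.textFile("input.txt")
  .select(explode(split($"value", " ")).as("word"))
  .groupBy("word")
  .count()
  .show()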
External Data Source API
Schema
JSON data is "dirty": the schema drifts across record versions, e.g.:
V1: id username age ("20", the age arrives as a string)
V2: id username nickname age address
V3:
V4:
{"name":"zhangsan", "gender":"F", "height":160}
{"name":"lisi", "gender":"M", "height":175, "age":30}
{"name":"wangwu", "gender":"M", "height":180.3}
df.printSchema
root
 |-- age: long (nullable = true)
 |-- gender: string (nullable = true)
 |-- height: double (nullable = true)
 |-- name: string (nullable = true)

df.show
+----+------+------+--------+
| age|gender|height|    name|
+----+------+------+--------+
|null|     F| 160.0|zhangsan|
|  30|     M| 175.0|    lisi|
|null|     M| 180.3|  wangwu|
+----+------+------+--------+
Partition discovery: Hive-style directory layouts are recognized automatically, e.g.
/user/hive/warehouse/emp/d=20180808
/user/hive/warehouse/emp/d=20180809
CREATE TABLE ... PARTITIONED BY (...)
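A minimal sketch of producing such a layout from Spark (the DataFrame name empDF and the date column d are assumptions):

// partitionBy lays out one subdirectory per value of d, e.g. .../emp/d=20180808;
// on read, Spark discovers d as a partition column automatically.
empDF.write.partitionBy("d").parquet("/user/hive/warehouse/emp")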
// Write two partitions of the same table with different but compatible schemas
val squaresDF = spark.sparkContext.makeRDD(1 to 5).map(i => (i, i * i)).toDF("value", "square")
squaresDF.write.parquet("/schem_merge/test_table/key=1")
val cubesDF = spark.sparkContext.makeRDD(6 to 10).map(i => (i, i * i * i)).toDF("value", "cube")
cubesDF.write.parquet("/schem_merge/test_table/key=2")
// Schema merging is off by default (it is relatively expensive), so enable it per read
val mergedDF = spark.read.format("parquet").option("mergeSchema", "true").load("/schem_merge/test_table")
mergedDF.printSchema()
root
 |-- value: integer (nullable = true)
 |-- square: integer (nullable = true)
 |-- cube: integer (nullable = true)
 |-- key: integer (nullable = true)

mergedDF.show()
+-----+------+----+---+
|value|square|cube|key|
+-----+------+----+---+
|    1|     1|null|  1|
|    2|     4|null|  1|
|    3|     9|null|  1|
|    4|    16|null|  1|
|    5|    25|null|  1|
|    6|  null| 216|  2|
|    7|  null| 343|  2|
|    8|  null| 512|  2|
|    9|  null| 729|  2|
|   10|  null|1000|  2|
+-----+------+----+---+
df.write.format("parquet").mode("ignore").save("/schem_merge/test_table/key=1")
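For reference, the DataFrameWriter save modes are "append", "overwrite", "ignore", and "error"/"errorifexists" (the default); with "ignore", the write above is silently skipped when the target already contains data.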
A record like "1507 xxxxxx xxxxx" carries its own timestamp; the three time semantics differ:
Event time: 1507, when the event actually happened (embedded in the record)
Ingestion time: 1509, when the record entered the system
Processing time: 1510, when the record was actually processed
Read less data
The fastest way to analyze big data? Ignore the data you don't need.
How: partitioning; columnar storage with compression (prefer columnar formats such as ORC/Parquet); filtering; and better predicate pushdown (WHERE conditions and join ON conditions pushed down to the data source).
Let the optimizer do the hard work
Goal: find the names of people whose salary exceeds 30,000. Only name is needed in the result; age is unused, and salary is only needed for the filter.
// Hand-written RDD version: parsing, projection, and filtering are all manual
case class Person(name: String, age: Int, salary: Double)
sc.textFile("")
  .map(_.split("\t"))
  .map(x => Person(x(0), x(1).toInt, x(2).toDouble))
  .map(p => (p.name, p.salary))
  .filter(_._2 > 30000)
  .map(_._1)
  .collect()
Naive SQL: as written, the subquery projects name and salary first, then filters outside:
select name from
(select name, salary from person) t
where t.salary > 30000
The optimizer pushes the predicate down into the subquery, effectively rewriting it as:
select name from
(select name, salary from person where salary > 30000) t
val joined = users.join(events, users("id") === events("id"))
val filtered = joined.filter(events("date") >= "2016-01-01")
Although the filter is written after the join, the optimizer can push it below the join, so only events from 2016-01-01 onward get joined.
Functions & UDFs
Exercise: total sales per day. Sample records (day, amount, id):
"2018-01-01,50,1111"
"2018-01-01,60,2222"
"2018-01-01,70,3333"
"2018-01-02,150,1111"
"2018-01-02,250,1111"
hobbies.txt
alice jogging,Coding,cooking 3
lina travel,dance 2
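A minimal sketch of the hobbies exercise with a UDF (assuming the input lines are name plus comma-separated hobbies, and the trailing number above is the expected count):

import org.apache.spark.sql.functions._
import spark.implicits._

// UDF that counts the comma-separated hobbies
val hobbyCount = udf((s: String) => s.split(",").length)

val hobbies = spark.read.textFile("hobbies.txt")
  .map(_.split("\\s+"))
  .map(a => (a(0), a(1)))
  .toDF("name", "hobbies")

hobbies.withColumn("cnt", hobbyCount($"hobbies")).show()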
login_ip.txt, input with one line per login event (name ip):
alice ip1
lina ip2
sven ip3
alice ip1
sven ip2
alice ip4
Expected output, per user: the collected IP list and the login count:
alice ip1,ip1,ip4 3
lina ip2 1
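A minimal sketch of the login_ip exercise (columns as above; collect_list keeps duplicates, which matches the ip1,ip1,ip4 in the expected output):

import org.apache.spark.sql.functions._
import spark.implicits._

val logins = spark.read.textFile("login_ip.txt")
  .map(_.split("\\s+"))
  .map(a => (a(0), a(1)))
  .toDF("name", "ip")

// Per user: join the collected IPs into one string and count the logins
logins.groupBy("name")
  .agg(concat_ws(",", collect_list($"ip")).as("ips"),
       count($"ip").as("cnt"))
  .show()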