使用 Spark Core API 来实现一些案例
-- Staging table for the raw tab-delimited text file.
CREATE TABLE states_raw(code STRING, name STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t";

-- Load the local file, replacing any existing table contents (OVERWRITE).
LOAD DATA LOCAL INPATH "/home/hadoop/data/seq.txt" OVERWRITE INTO TABLE states_raw;

-- Same schema stored as a SequenceFile so Spark can read it via sc.sequenceFile.
CREATE TABLE states_seq(code STRING, name STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t"
STORED AS SEQUENCEFILE;

-- Copy rows with an explicit column list (not SELECT *) so the statement
-- stays correct if columns are later added to states_raw.
INSERT INTO TABLE states_seq SELECT code, name FROM states_raw;
// Read the Hive warehouse directory for states_seq as a SequenceFile RDD.
// Hive writes SequenceFile records with a BytesWritable key, so the key type
// here is BytesWritable and the row text is read as the String value.
val file = sc.sequenceFile[BytesWritable, String]("/user/hive/warehouse/states_seq")
// Trigger the job and bring every record back to the driver
// (only appropriate for small datasets like this sample).
file.collect()
// NOTE(review): stray fragment — presumably a reminder of a map(_ * 2)
// transformation example; it is not valid on its own. Verify intent.
map(_*2)
# Submit the packaged app to Spark, running locally with 2 worker threads.
# --class names the main class inside the jar given as the last argument.
./bin/spark-submit \
--class com.ruozedata.core.ReadSequenceFileApp \
--master local[2] \
/home/hadoop/lib/train-scala-1.0.jar
Array[(String, Iterable[String])] = Array(
(B,CompactBuffer(B1, B2, B3)),
(A,CompactBuffer(A1, A2, A3)),
(C,CompactBuffer(C1))
)
first_value: 每个分组对应的key里面value的第一个元素
(B,List((B1,B1), (B2,B1), (B3,B1)))
(A,List((A1,A1), (A2,A1), (A3,A1)))
(C,List((C1,C1)))
wc sort
map() ==> sort ==> map()
(word,count) ==> (count, word).sort ==> (word,count)
二次排序: A B
((c1,c2),value)
视(c1,c2)为一个整体,对这个整体做排序,最终排序完,只取value即可
col1 col2 col3 col4
==>
col1 和col2 排
col1 col2 col3 col4
((1527,2106),(1527, 2106))
OOM
emp二次排序
last_value
yield(item, items.head) yield是什么意思
trait是什么意思
看scala程序
-- Staging table for the raw tab-delimited text file.
CREATE TABLE states_raw(code STRING, name STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t";

-- Load the local file, replacing any existing table contents (OVERWRITE).
LOAD DATA LOCAL INPATH "/home/hadoop/data/seq.txt" OVERWRITE INTO TABLE states_raw;

-- Same schema stored as a SequenceFile so Spark can read it via sc.sequenceFile.
CREATE TABLE states_seq(code STRING, name STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t"
STORED AS SEQUENCEFILE;

-- Copy rows with an explicit column list (not SELECT *) so the statement
-- stays correct if columns are later added to states_raw.
INSERT INTO TABLE states_seq SELECT code, name FROM states_raw;
// Read the Hive warehouse directory for states_seq as a SequenceFile RDD.
// Hive writes SequenceFile records with a BytesWritable key, so the key type
// here is BytesWritable and the row text is read as the String value.
val file = sc.sequenceFile[BytesWritable, String]("/user/hive/warehouse/states_seq")
// Trigger the job and bring every record back to the driver
// (only appropriate for small datasets like this sample).
file.collect()
// NOTE(review): stray fragment — presumably a reminder of a map(_ * 2)
// transformation example; it is not valid on its own. Verify intent.
map(_*2)
# Submit the packaged app to Spark, running locally with 2 worker threads.
# --class names the main class inside the jar given as the last argument.
./bin/spark-submit \
--class com.ruozedata.core.ReadSequenceFileApp \
--master local[2] \
/home/hadoop/lib/train-scala-1.0.jar
Array[(String, Iterable[String])] = Array(
(B,CompactBuffer(B1, B2, B3)),
(A,CompactBuffer(A1, A2, A3)),
(C,CompactBuffer(C1))
)
first_value: 每个分组对应的key里面value的第一个元素
(B,List((B1,B1), (B2,B1), (B3,B1)))
(A,List((A1,A1), (A2,A1), (A3,A1)))
(C,List((C1,C1)))
wc sort
map() ==> sort ==> map()
(word,count) ==> (count, word).sort ==> (word,count)
二次排序: A B
((c1,c2),value)
视(c1,c2)为一个整体,对这个整体做排序,最终排序完,只取value即可
col1 col2 col3 col4
==>
col1 和col2 排
col1 col2 col3 col4
((1527,2106),(1527, 2106))
OOM
emp二次排序
last_value
yield(item, items.head) yield是什么意思
trait是什么意思
看scala程序