1. Shell command wordcount
Contents of wc.txt:
hello world
hello spark
hello scala
hello hive
hello hbase regionServer
The shell command:
awk '{for(i=1;i<=NF;i++) wc[$i]++} END{for(w in wc) print w":"wc[w]}' wc.txt
Result:
world:1
scala:1
spark:1
hbase:1
regionServer:1
hive:1
hello:5
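For reference, the same counting logic as a minimal plain-Scala sketch (an addition, not one of the original three approaches; it assumes wc.txt sits in the working directory):

import scala.io.Source
import scala.collection.mutable

// Count whitespace-separated tokens with a mutable map, mirroring the awk associative array
val counts = mutable.Map.empty[String, Int].withDefaultValue(0)
for (line <- Source.fromFile("wc.txt").getLines(); word <- line.split(" ") if word.nonEmpty)
  counts(word) += 1
counts.foreach { case (w, c) => println(s"$w:$c") }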
2. Hive wordcount
The wordcount table has a single column, wc.
Value of the wc column: hello world hello spark hello scala hello java
Syntax used:
SELECT explode(myCol) AS myNewCol FROM myTable; -- explode flattens a complex array or map structure in one Hive row into multiple rows.
SELECT adid, count(1)
FROM pageAds LATERAL VIEW explode(adid_list) adTable AS adid GROUP BY adid; -- explode splits one row with a complex structure into multiple rows, and LATERAL VIEW then lets you aggregate over them; adTable is the alias of the exploded view and adid is its column.
The Hive SQL statement:
select col1, count(*) as cnt from (select explode(split(wc," ")) as col1 from wordcount) t group by col1 order by cnt desc;
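For comparison, the same split/explode/group-by pattern can be sketched with Spark SQL in Scala (a rough sketch, not part of the original notes; the inline DataFrame merely stands in for the Hive wordcount table, which could instead be read via spark.table("wordcount")):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, explode, split}

val spark = SparkSession.builder().master("local[1]").appName("wordcount-sql").getOrCreate()
import spark.implicits._

// Stand-in for the Hive wordcount table
val df = Seq("hello world hello spark hello scala hello java").toDF("wc")

df.select(explode(split(col("wc"), " ")).as("col1"))
  .groupBy("col1")
  .count()
  .orderBy(col("count").desc)
  .show()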
3. Spark Core wordcount
import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf().setMaster("local[1]").setAppName("wordcount")
val sc = new SparkContext(conf)
val lines = sc.textFile(args(0))   // input path passed as the first program argument
val wc = lines.flatMap(_.split(" ")).map(word => (word, 1)).reduceByKey(_ + _)
wc.foreach(println)
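If the counts should also come out sorted like the Hive version, one possible addition (assuming the wc RDD from the snippet above):

// Sort descending by count before printing; collect() is fine for a small vocabulary
wc.sortBy(_._2, ascending = false).collect().foreach(println)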