对于一张数据表(结构如下图所示,图略)
对上述表实现分组统计查询: select pageid,age,count(1) from pv_users group by pageid,age;
将上述的SQL查询语句改写成Spark的代码,如下:
package com.company.sparkcore
import org.apache.spark.{SparkConf, SparkContext}
object CountPVByGroup {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
.setAppName(CountPVByGroup.getClass.getSimpleName)
.setMaster("local")
// Logger.getLogger("org.apache.spark").setLevel(Level.OFF)
// Logger.getLogger("org.apache.hadoop").setLevel(Level.OFF)
val sc = new SparkContext(conf)
val lines = sc.textFile("file:///e:/pv_users.txt")
//拼接成(1_25,1)的形式
val newKeyValue = lines.map(_.split(",")).map(pvdata => (