首先,执行 cd /usr/spark 进入该目录,并在其中创建 data1.txt,内容如下:
Tom,DataBase,80
Tom,Algorithm,50
Tom,DataStructure,60
Jim,DataBase,90
Jim,Algorithm,60
Jim,DataStructure,80
其次,在master进入spark-shell
cd /usr/spark/spark-2.4.1-bin-hadoop2.7/bin/
spark-shell --master spark://master:7077
再者,在 spark-shell 中依次执行以下语句:
// Load the raw score file; each line has the form "name,course,score".
val lines = sc.textFile("file:///usr/spark/data1.txt")
lines.count()                                  // total number of rows
lines.distinct().count()                       // row count after dedup, to detect duplicates
lines.map(_.split(",")(0)).distinct().count    // number of distinct students
lines.map(_.split(",")(1)).distinct().count    // number of distinct courses
---------------------------------
// Average score of the student "Tom" across all courses.
// Fixes two issues in the original:
//  - the row was split three times per record; split once and reuse the fields
//  - `sum / count` on Ints truncated the average (e.g. 190/3 -> 63);
//    cast the sum to Double so the fractional part is kept
lines.map(_.split(","))
  .filter(fields => fields(0) == "Tom")
  .map(fields => (fields(0), fields(2).toInt))
  .mapValues(score => (score, 1))
  .reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2))
  .mapValues { case (sum, n) => sum.toDouble / n }
  .collect()
---------------------------------
// Number of courses each student is enrolled in:
// emit (student, 1) per row, then sum the ones per student.
lines.map(row => (row.split(",")(0), 1))
  .reduceByKey(_ + _)
  .collect
---------------------------------
// How many rows (i.e. enrollments) belong to the DataBase course.
lines.map(_.split(",")(1)).filter(_ == "DataBase").count
---------------------------------
// Average score per course.
// Fixes the original's integer division: `sum / count` on Ints truncated
// the result (e.g. (80+90)/2 is fine, but 170/3 -> 56); cast the sum to
// Double before dividing so fractional averages are preserved.
// Also splits each row only once instead of twice.
lines.map { row =>
  val fields = row.split(",")
  (fields(1), fields(2).toInt)
}
  .mapValues(score => (score, 1))
  .reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2))
  .mapValues { case (total, cnt) => total.toDouble / cnt }
  .collect()
---------------------------------
// Count DataBase enrollments with a long accumulator.
// `foreach` is an action, so each matching row bumps the accumulator exactly once.
val accum = sc.longAccumulator("My Accumulator")
lines.map(_.split(",")(1))
  .filter(_ == "DataBase")
  .foreach(_ => accum.add(1L))
accum.value
最后,终于丝滑一次