1、在 Maven 的 pom.xml 中导入 Spark 相关依赖:
<!-- Version properties referenced by the dependency declarations below. -->
<properties>
<!-- Java level for the project; presumably consumed by compiler configuration elsewhere in the POM (TODO confirm). -->
<java.version>1.8</java.version>
<!-- Version applied to the spark-core dependency. -->
<spark.version>2.1.0</spark.version>
<!-- Scala binary version; used as the suffix of the spark-core artifactId. -->
<scala.version>2.11</scala.version>
</properties>
<dependencies>
<!-- Spark core, resolved as spark-core_${scala.version} at version ${spark.version}. -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- Scala runtime library, pinned explicitly to 2.11.11. -->
<!-- NOTE(review): this version is hard-coded rather than derived from a property; it should stay on the 2.11.x line to match the artifact suffix above. Confirm it is compatible with the Scala version Spark ${spark.version} was built against. -->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>2.11.11</version>
</dependency>
</dependencies>
2、用 Java 代码操作 Spark 框架,对热词进行统计分析(取出出现次数排名前十的热词)
package com.spark.test;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
import java.util.List;
/**
 * Hot-word analysis job: counts how often each keyword occurs in a tab-separated log file
 * and prints the ten most frequent keywords.
 *
 * <p>The keyword is taken from the third tab-separated field (index 2) of every input line.
 * Lines that have fewer than three fields are skipped instead of aborting the whole job.
 *
 * <p>Usage: {@code SparkTest [inputPath]}. When no argument is given, the default sample
 * file path is used.
 */
public class SparkTest {

    /** Input file read when no path is supplied on the command line. */
    private static final String DEFAULT_INPUT_PATH = "C:\\Users\\spectre\\Desktop\\Sogou.sample";

    /** Separator between the fields of one input line. */
    private static final String FIELD_SEPARATOR = "\t";

    /** Zero-based index of the field that holds the keyword. */
    private static final int KEYWORD_FIELD_INDEX = 2;

    /** Number of top-ranked keywords to print. */
    private static final int TOP_N = 10;

    /**
     * Runs the job on a local Spark master with two worker threads.
     *
     * @param args optional; {@code args[0]} overrides the input file path
     */
    public static void main(String[] args) {
        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT_PATH;

        // 1. Configure the Spark runtime.
        SparkConf conf = new SparkConf().setAppName("HotWords").setMaster("local[2]");

        // 2. Create the context; try-with-resources guarantees it is stopped even if the job fails.
        try (JavaSparkContext context = new JavaSparkContext(conf)) {
            // 3. Load the input file as an RDD of lines.
            JavaRDD<String> lines = context.textFile(inputPath, 1);

            // 4. Split every line once and drop malformed lines that lack the keyword field,
            //    so a single bad record cannot fail the job with ArrayIndexOutOfBoundsException.
            JavaRDD<String[]> records = lines
                    .map(line -> line.split(FIELD_SEPARATOR))
                    .filter(fields -> fields.length > KEYWORD_FIELD_INDEX);

            // 5. Emit (keyword, 1) for every record.
            JavaPairRDD<String, Integer> keywordOnes =
                    records.mapToPair(fields -> new Tuple2<>(fields[KEYWORD_FIELD_INDEX], 1));

            // 6. Sum the ones per keyword to get (keyword, count).
            JavaPairRDD<String, Integer> keywordCounts = keywordOnes.reduceByKey(Integer::sum);

            // 7. Swap to (count, keyword) so the count becomes the sort key.
            JavaPairRDD<Integer, String> countKeywords = keywordCounts.mapToPair(Tuple2::swap);

            // 8. Sort by count in descending order and take the first TOP_N entries.
            List<Tuple2<Integer, String>> topKeywords = countKeywords.sortByKey(false).take(TOP_N);

            // 9. Print each keyword followed by its count.
            for (Tuple2<Integer, String> entry : topKeywords) {
                System.out.println(entry._2 + "++++++" + entry._1);
            }
        }
    }
}