When Spark runs standalone you launch it through a main method, so how do you embed it in a Spring Boot application and drive it over HTTP? As a worked example, we'll build an endpoint that returns the 10 most frequent words in a text file.
Project environment:
JDK: 1.8
Spark: 2.2.0
Project setup:
pom.xml dependencies:
<parent>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-parent</artifactId>
    <version>1.5.3.RELEASE</version>
    <relativePath /> <!-- lookup parent from repository -->
</parent>
<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
    <java.version>1.8</java.version>
    <scala.version>2.11</scala.version>
    <spark.version>2.2.0</spark.version>
</properties>
<dependencies>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-aop</artifactId>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <!-- slf4j-log4j12 and log4j are excluded so Spark does not clash with
         Spring Boot's default Logback binding. -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_${scala.version}</artifactId>
        <version>${spark.version}</version>
        <exclusions>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
            </exclusion>
            <exclusion>
                <groupId>log4j</groupId>
                <artifactId>log4j</artifactId>
            </exclusion>
        </exclusions>
        <!-- provided keeps Spark out of the packaged jar; this is fine when
             running from the IDE, but a deployed jar must get the Spark jars
             from elsewhere (e.g. spark-submit or an explicit classpath). -->
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_${scala.version}</artifactId>
        <version>${spark.version}</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_${scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-test</artifactId>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-configuration-processor</artifactId>
        <optional>true</optional>
    </dependency>
</dependencies>
Configuration class:
@Configuration
@ConfigurationProperties(prefix = "spark")
public class SparkContextBean {
    private String sparkHome = ".";
    private String appName = "sparkTest";
    private String master = "local";

    @Bean
    @ConditionalOnMissingBean(SparkConf.class)
    public SparkConf sparkConf() throws Exception {
        // All three values are bound from the spark.* properties in application.yml.
        return new SparkConf().setAppName(appName).setMaster(master).setSparkHome(sparkHome);
    }

    @Bean
    @ConditionalOnMissingBean(JavaSparkContext.class)
    public JavaSparkContext javaSparkContext() throws Exception {
        return new JavaSparkContext(sparkConf());
    }

    public String getSparkHome() {
        return sparkHome;
    }

    public void setSparkHome(String sparkHome) {
        this.sparkHome = sparkHome;
    }

    public String getAppName() {
        return appName;
    }

    public void setAppName(String appName) {
        this.appName = appName;
    }

    public String getMaster() {
        return master;
    }

    public void setMaster(String master) {
        this.master = master;
    }
}
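One optional hardening, not in the original post: declare a destroy method on the context bean so Spring stops Spark cleanly when the Boot application shuts down. Since JavaSparkContext is Closeable, Spring's inferred destroy method usually covers this already; being explicit just documents the intent. A minimal sketch:

@Bean(destroyMethod = "stop")
@ConditionalOnMissingBean(JavaSparkContext.class)
public JavaSparkContext javaSparkContext() throws Exception {
    // Spring invokes JavaSparkContext.stop() when the application context closes.
    return new JavaSparkContext(sparkConf());
}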
Service implementation:
@Service
public class SparkTestService {
    private static final Logger logger = LoggerFactory.getLogger(SparkTestService.class);
    private static final Pattern SPACE = Pattern.compile(" ");

    @Autowired
    private JavaSparkContext sc;

    public Map<String, Object> calculateTopTen() {
        // LinkedHashMap keeps the words in descending-frequency order.
        Map<String, Object> result = new LinkedHashMap<>();
        JavaRDD<String> lines = sc.textFile("src/test/java/test.txt").cache();
        logger.info("line count: {}", lines.count());
        JavaRDD<String> words = lines.flatMap(str -> Arrays.asList(SPACE.split(str)).iterator());
        JavaPairRDD<String, Integer> ones = words.mapToPair(str -> new Tuple2<>(str, 1));
        JavaPairRDD<String, Integer> counts = ones.reduceByKey((i1, i2) -> i1 + i2);
        // Swap to (count, word), sort descending by count, then swap back.
        JavaPairRDD<Integer, String> temp = counts.mapToPair(tuple -> new Tuple2<>(tuple._2, tuple._1));
        JavaPairRDD<String, Integer> sorted = temp.sortByKey(false).mapToPair(tuple -> new Tuple2<>(tuple._2, tuple._1));
        logger.info("distinct word count: {}", sorted.count());
        // The RDD is already sorted, so take(10) returns the head. top(10) would
        // fail here at runtime, because Tuple2 has no natural ordering.
        List<Tuple2<String, Integer>> output = sorted.take(10);
        for (Tuple2<String, Integer> tuple : output) {
            result.put(tuple._1(), tuple._2());
        }
        return result;
    }
    /**
     * Practice demo for getting familiar with the basic RDD API.
     */
    public void sparkExerciseDemo() {
        List<Integer> data = Lists.newArrayList(1, 2, 3, 4, 5, 6);
        JavaRDD<Integer> rdd01 = sc.parallelize(data);
        rdd01 = rdd01.map(num -> num * num);
        // data map: 1,4,9,16,25,36
        logger.info("data map :{}", Joiner.on(",").skipNulls().join(rdd01.collect()));
        rdd01 = rdd01.filter(x -> x < 6);
        // data filter: 1,4
        logger.info("data filter :{}", Joiner.on(",").skipNulls().join(rdd01.collect()));
        rdd01 = rdd01.flatMap(x -> {
            Integer[] test = {x, x + 1, x + 2};
            return Arrays.asList(test).iterator();
        });
        // flatMap: 1,2,3,4,5,6
        logger.info("flatMap :{}", Joiner.on(",").skipNulls().join(rdd01.collect()));
        JavaRDD<Integer> unionRdd = sc.parallelize(data);
        rdd01 = rdd01.union(unionRdd);
        // union: 1,2,3,4,5,6,1,2,3,4,5,6
        logger.info("union :{}", Joiner.on(",").skipNulls().join(rdd01.collect()));
        List<Integer> result = Lists.newArrayList();
        result.add(rdd01.reduce((v1, v2) -> v1 + v2));
        // reduce: 42
        logger.info("reduce :{}", Joiner.on(",").skipNulls().join(result));
        result.forEach(System.out::print);
        JavaPairRDD<Integer, Iterable<Integer>> groupRdd = rdd01.groupBy(x -> {
            logger.info("======groupBy======:{}", x);
            return x > 10 ? 0 : 1;
        });
        List<Tuple2<Integer, Iterable<Integer>>> grouped = groupRdd.collect();
        // group by key:1 value:1,2,3,4,5,6,1,2,3,4,5,6
        grouped.forEach(x -> {
            logger.info("group by key:{} value:{}", x._1, Joiner.on(",").skipNulls().join(x._2));
        });
    }
    /**
     * Spark Streaming practice.
     */
    public void sparkStreaming() throws InterruptedException {
        // 10-second batch interval
        JavaStreamingContext jsc = new JavaStreamingContext(sc, Durations.seconds(10));
        JavaReceiverInputDStream<String> lines = jsc.receiverStream(new CustomReceiver(StorageLevel.MEMORY_AND_DISK_2()));
        JavaDStream<Long> count = lines.count();
        count = count.map(x -> {
            logger.info("records in this batch: {}", x);
            return x;
        });
        count.print();
        jsc.start();
        // Blocks the calling (HTTP) thread until the streaming context is stopped.
        jsc.awaitTermination();
        // Stop the streaming context but keep the shared SparkContext alive.
        jsc.stop(false);
    }
}
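The pom above also pulls in spark-sql, which this demo never touches. For completeness, here is a sketch of the same top-10 computation with the Dataset API; the class and method names are illustrative, not part of the original project. SparkSession.builder().getOrCreate() reuses the SparkContext that the configuration class already started:

@Service
public class SparkSqlTestService {
    // Injected only to guarantee the shared context bean exists before this runs.
    @Autowired
    private JavaSparkContext sc;

    public List<Row> topTenWithSql() {
        // getOrCreate() picks up the already-running SparkContext.
        SparkSession spark = SparkSession.builder().getOrCreate();
        Dataset<String> lines = spark.read().textFile("src/test/java/test.txt");
        return lines
                .flatMap((FlatMapFunction<String, String>) s -> Arrays.asList(s.split(" ")).iterator(), Encoders.STRING())
                .groupBy("value") // textFile exposes a single column named "value"
                .count()
                .orderBy(functions.col("count").desc())
                .takeAsList(10);
    }
}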
/**
 * Custom streaming Receiver that generates random numbers.
 */
public class CustomReceiver extends Receiver<String> {
    private static final Logger logger = LoggerFactory.getLogger(CustomReceiver.class);
    private static final long serialVersionUID = 5817531198342629801L;

    public CustomReceiver(StorageLevel storageLevel) {
        super(storageLevel);
    }

    @Override
    public void onStart() {
        // onStart() must not block, so the generating loop runs on its own thread.
        new Thread(this::doStart).start();
        logger.info("Starting receiver...");
    }

    public void doStart() {
        while (!isStopped()) {
            int value = RandomUtils.nextInt(100);
            // Throttle roughly one time in five to make the batches uneven.
            if (value < 20) {
                try {
                    Thread.sleep(1000);
                } catch (Exception e) {
                    logger.error("sleep exception", e);
                    restart("sleep exception", e);
                }
            }
            store(String.valueOf(value));
        }
    }

    @Override
    public void onStop() {
        logger.info("Stopping receiver...");
    }
}
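For quick local experiments you don't strictly need a custom receiver; Spark ships a built-in socket source. Assuming something like nc -lk 9999 is feeding the port (an assumption, not part of the original setup), the receiverStream line above could be swapped for:

// Built-in alternative: consume lines from a local TCP socket.
JavaReceiverInputDStream<String> lines = jsc.socketTextStream("localhost", 9999);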
@RestController
public class DemoController {
    @Autowired
    private SparkTestService sparkTestService;

    @RequestMapping("/demo/top10")
    public Map<String, Object> calculateTopTen() {
        return sparkTestService.calculateTopTen();
    }

    @RequestMapping("/demo/exercise")
    public void exercise() {
        sparkTestService.sparkExerciseDemo();
    }

    @RequestMapping("/demo/stream")
    public void streamingDemo() throws InterruptedException {
        sparkTestService.sparkStreaming();
    }
}
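To sanity-check the wiring end to end, a throwaway integration test can boot the app and call the endpoint. A sketch assuming the standard Boot 1.5 test setup from spring-boot-starter-test; the class name and assertion are illustrative:

@RunWith(SpringRunner.class)
@SpringBootTest(webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
public class DemoControllerTest {
    @Autowired
    private TestRestTemplate restTemplate;

    @Test
    public void topTenReturnsAtMostTenEntries() {
        // Relies on src/test/java/test.txt existing (see the setup note at the end).
        Map<?, ?> body = restTemplate.getForObject("/demo/top10", Map.class);
        Assert.assertNotNull(body);
        Assert.assertTrue(body.size() <= 10);
    }
}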
application.yml:
server:
  port: 8054
spark:
  spark-home: .
  app-name: sparkTest
  master: local[4]
Create a test.txt file under the project's src/test/java directory; any pile of random words will do.
Start the application and open http://localhost:8054/demo/top10 (or run curl http://localhost:8054/demo/top10) to get the 10 most frequent words.