1. Install nc to fake a data source (the producer): yum install -y nc
2. Listen on the chosen port; every line typed into the nc session is sent to connected clients: nc -lk 9999
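For example, once the streaming job below has connected, each line typed into the nc session arrives as one record (hypothetical input):

hello world hello
spark streaming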
3. Write the Spark Streaming program (Spark 1.6)
Add the dependencies; the _2.11 suffix is the Scala version and 1.6.1 is the Spark version:
<dependencies>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>1.6.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_2.11</artifactId>
        <version>1.6.1</version>
        <!-- provided assumes the cluster supplies this jar; to run locally from an IDE, include provided-scope dependencies -->
        <scope>provided</scope>
    </dependency>
</dependencies>
If your POM targets Spark 2.4 instead, startup fails with an ArrayIndexOutOfBoundsException (the transitive paranamer version pulled in there is too old to parse newer class files); adding one dependency fixes it:
<dependency>
    <groupId>com.thoughtworks.paranamer</groupId>
    <artifactId>paranamer</artifactId>
    <version>2.8</version>
</dependency>
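Moving to a 2.x POM also changes the Java API itself: FlatMapFunction.call must return an Iterator instead of an Iterable, so on Spark 2.4 the flatMap in the examples below would become (a minimal sketch with the same split logic; also needs import java.util.Iterator):

@Override
public Iterator<String> call(String line) throws Exception {
    return Arrays.asList(line.split(" ")).iterator();
}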
A simple streaming word count:
package streaming;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.*;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;
import java.util.Arrays;
public class Stream {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("test").setMaster("local[2]");// app configuration
        JavaSparkContext sc = new JavaSparkContext(sparkConf);
        JavaStreamingContext ssc = new JavaStreamingContext(sc, Durations.seconds(5));// batch interval
        sc.setLogLevel("WARN"); // log level
        JavaReceiverInputDStream<String> lines = ssc.socketTextStream("mini1", 9999);// connect to the nc source
        JavaDStream<String> stringJavaDStream = lines.flatMap(new FlatMapFunction<String, String>() {// split each line on spaces
            @Override
            public Iterable<String> call(String line) throws Exception {
                return Arrays.asList(line.split(" "));
            }
        });
        JavaPairDStream<String, Integer> pairDStream = stringJavaDStream.mapToPair(new PairFunction<String, String, Integer>() {// pair each word with a count of 1
            @Override
            public Tuple2<String, Integer> call(String s) throws Exception {
                return new Tuple2<>(s, 1);
            }
        });
        JavaPairDStream<String, Integer> dStream = pairDStream.reduceByKey(new Function2<Integer, Integer, Integer>() {// sum the counts per word
            @Override
            public Integer call(Integer integer, Integer integer2) throws Exception {
                return integer + integer2;
            }
        });
        dStream.print();// print each batch of this DStream to the console
        ssc.start();// start the computation
        ssc.awaitTermination();// block until terminated
    }
}
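A quick sanity check, assuming nc is already listening on mini1:9999: type a line such as hello world hello into the nc session, and within one 5-second batch the console should print something like:

-------------------------------------------
Time: 1496500795000 ms
-------------------------------------------
(hello,2)
(world,1)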
Advanced usage
1. Window operations
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.*;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;
import java.util.Arrays;
public class Stream {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("test").setMaster("local[2]");// app configuration
        JavaSparkContext sc = new JavaSparkContext(sparkConf);
        JavaStreamingContext ssc = new JavaStreamingContext(sc, Durations.seconds(5));// batch interval
        sc.setLogLevel("WARN"); // log level
        JavaReceiverInputDStream<String> lines = ssc.socketTextStream("mini1", 9999);// connect to the nc source
        JavaDStream<String> stringJavaDStream = lines.flatMap(new FlatMapFunction<String, String>() {// split each line on spaces
            @Override
            public Iterable<String> call(String line) throws Exception {
                return Arrays.asList(line.split(" "));
            }
        });
        JavaPairDStream<String, Integer> pairDStream = stringJavaDStream.mapToPair(new PairFunction<String, String, Integer>() {// pair each word with a count of 1
            @Override
            public Tuple2<String, Integer> call(String s) throws Exception {
                return new Tuple2<>(s, 1);
            }
        });
        // The window length and the slide interval must both be integer multiples of the batch interval:
        // the window length says how many batches one window covers, and the slide interval says how often
        // (again in whole batches) the window is evaluated; the batch interval is the smallest unit.
        JavaPairDStream<String, Integer> keyAndWindow = pairDStream.reduceByKeyAndWindow(new Function2<Integer, Integer, Integer>() {// sum the counts per word
            @Override
            public Integer call(Integer integer, Integer integer2) throws Exception {
                return integer + integer2;
            }
        }, Durations.seconds(5), Durations.seconds(5));// window length and slide interval
        keyAndWindow.print();// print each batch of this DStream to the console
        ssc.start();// start the computation
        ssc.awaitTermination();// block until terminated
    }
}
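With a 5-second window sliding every 5 seconds, each window covers exactly one batch, so the output matches plain reduceByKey; the operator only pays off with longer windows. For overlapping windows, Spark also offers an incremental overload that takes an inverse function, so each step only adds the batch entering the window and subtracts the batch leaving it instead of recomputing everything; it requires a checkpoint directory. A minimal sketch, assuming the same pairDStream as above and a 15-second window sliding every 5 seconds:

JavaPairDStream<String, Integer> windowed = pairDStream.reduceByKeyAndWindow(
        new Function2<Integer, Integer, Integer>() {// add counts from the batch entering the window
            @Override
            public Integer call(Integer a, Integer b) throws Exception {
                return a + b;
            }
        },
        new Function2<Integer, Integer, Integer>() {// subtract counts from the batch leaving the window
            @Override
            public Integer call(Integer a, Integer b) throws Exception {
                return a - b;
            }
        },
        Durations.seconds(15), Durations.seconds(5));// requires ssc.checkpoint(...) to be set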
2. Stateful processing (involves historical data)
import com.google.common.base.Optional; // Spark 1.6's Java API uses Guava's Optional here; 2.x switched to org.apache.spark.api.java.Optional
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.*;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;
import java.util.Arrays;
import java.util.List;
public class Stream {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("test").setMaster("local[2]");// app configuration
        JavaSparkContext sc = new JavaSparkContext(sparkConf);
        JavaStreamingContext ssc = new JavaStreamingContext(sc, Durations.seconds(5));// batch interval
        ssc.checkpoint("hdfs://mini1:9000/checkpoint/");// stateful operations require a checkpoint directory
        sc.setLogLevel("WARN"); // log level
        JavaReceiverInputDStream<String> lines = ssc.socketTextStream("mini1", 9999);// connect to the nc source
        JavaDStream<String> stringJavaDStream = lines.flatMap(new FlatMapFunction<String, String>() {// split each line on spaces
            @Override
            public Iterable<String> call(String line) throws Exception {
                return Arrays.asList(line.split(" "));
            }
        });
        JavaPairDStream<String, Integer> pairDStream = stringJavaDStream.mapToPair(new PairFunction<String, String, Integer>() {// pair each word with a count of 1
            @Override
            public Tuple2<String, Integer> call(String s) throws Exception {
                return new Tuple2<>(s, 1);
            }
        });
        // updateStateByKey merges each key's values from the current batch with that key's previous state
        JavaPairDStream<String, Integer> dStream = pairDStream.updateStateByKey(new Function2<List<Integer>, Optional<Integer>, Optional<Integer>>() {
            @Override
            public Optional<Integer> call(List<Integer> integers, Optional<Integer> optional) throws Exception {// (current batch values, previous state)
                Integer updatedValue = 0;// initial value for a key seen for the first time
                if (optional.isPresent()) {// read the previous state
                    updatedValue = optional.get();
                }
                for (Integer value : integers) {// add this batch's values
                    updatedValue += value;
                }
                return Optional.of(updatedValue);// return the new state
            }
        }, 10);// number of partitions
        dStream.print();// print each batch of this DStream to the console
        ssc.start();// start the computation
        ssc.awaitTermination();// block until terminated
    }
}
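Unlike the previous examples, the counts now accumulate across batches instead of resetting. A hypothetical session where the first batch carries hello hello and the next batch carries hello would print:

-------------------------------------------
Time: 1496500795000 ms
-------------------------------------------
(hello,2)

-------------------------------------------
Time: 1496500800000 ms
-------------------------------------------
(hello,3)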