对比方法:
1、写入Kafka一个时间戳。
2、使用Spark 的 Structured Streaming 去消费,解析出时间戳,使用当前时间戳减去第一步里面的时间戳,输出耗时
3、同时使用Flink消费第一步的时间戳,使用当前时间戳减去这个时间戳,输出耗时。
4、结论:(1)、Structured Streaming 准实时处理,延时340毫秒左右。
(2)、Flink Streaming 实时处理,延时3毫秒左右。
(3)、Structured Streaming 比 Flink Streaming 慢340毫秒左右。
其中:
Spark实现:
package cn.sparkStream.test;
import java.nio.charset.StandardCharsets;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.streaming.OutputMode;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;

import com.alibaba.druid.support.json.JSONUtils;
public class StreamTest {
public static Logger logger = Logger.getLogger(StreamTest.class);
public static void main(String[] args) throws Exception {
Logger.getLogger("org.apache.spark").setLevel(Level.WARN);
Logger.getLogger("org.apache.kafka").setLevel(Level.WARN);
final SparkSession spark = SparkSession.builder()
.appName("StructuredNetworkWordCount")
.master("local")
//.config(conf)
.getOrCreate();
Dataset<Row> inputStream = spark.readStream()
.format("kafka")
.option("kafka.bootstrap.servers", "192.168.90.100:9092")//"host1:port1,host2:port2"
.option("subscribe", "testSpeed")//"topic1,topic2"
.load();
Dataset<String> map = inputStream.map(new MapFunction<Row, String>() {
public String call(Row row) throws Exception {
try {
String line = new String((byte[]) row.get(1));
String[] split = line.split("_");
Long sentTime = Long.parseLong(split[2]);
System.out.println("spark耗时:"+(new Date().getTime()-sentTime));
HashMap map = new HashMap();
map.put("name", split[0]);
map.put("age", split[1]);
return JSONUtils.toJSONString(map);
} catch (Exception e) {
System.out.println(e.getMessage());
}
return null;
}
}, Encoders.STRING());
StreamingQuery console = map.writeStream()
.outputMode(OutputMode.Append())
.format("console")
.start();
console.awaitTermination();
}
}
Flink实现:
package cn.logClean.flink;
import cn.logClean.flink.service.hostKpi.HostKpiService;
import cn.logClean.flink.service.log.entity.LogData;
import cn.logClean.flink.function.*;
import cn.logClean.flink.utils.Constant;
import cn.logClean.flink.utils.ReadConfigation;
import com.alibaba.fastjson.JSON;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer09;
import org.apache.flink.util.Collector;
import java.util.*;
/**
* Created by Shaqf on 2018/6/22.
*/
public class StartMain {
public static void main(String[] args) throws Exception {
System.out.println("系统启动...");
ReadConfigation.setConfig(args);
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.enableCheckpointing(5000);
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
// kafka配置文件
String ZOOKEEPER_HOST = ReadConfigation.getConfigItem("kafka.zookeeper.connect");//"192.168.90.100:2181/kafka";
String KAFKA_BROKER = ReadConfigation.getConfigItem("kafka.metadata.broker.list");//"192.168.1.139:9092";
String TRANSACTION_GROUP = ReadConfigation.getConfigItem("kafka.customer.groupid");
String AUTO_OFFSET_RESET= ReadConfigation.getConfigItem("kafka.auto.offset.reset");
Integer Parallelism_NM= ReadConfigation.getConfigItemInteger("flink.parallelism.nm");
// kafka配置文件
Properties kafkaProps = new Properties();
kafkaProps.setProperty("zookeeper.connect", ZOOKEEPER_HOST);
kafkaProps.setProperty("bootstrap.servers", KAFKA_BROKER);
kafkaProps.setProperty("group.id", TRANSACTION_GROUP);
kafkaProps.setProperty("auto.offset.reset", AUTO_OFFSET_RESET);
//测试
DataStream<String> testStream = env.addSource(new FlinkKafkaConsumer09<>("testSpeed", new SimpleStringSchema(), kafkaProps));
testStream.flatMap(new FlatMapFunction<String,String>(){
public void flatMap(String line, Collector<String> collector){
String[] split = line.split("_");
Long sentTime = Long.parseLong(split[2]);
System.out.println("flink耗时:"+(new Date().getTime()-sentTime));
}
});
env.execute("streamingFlink");
}
}
Spark耗时截图:
Flink耗时截图: