概述
Structured Streaming是基于Spark SQL的流式处理框架,支持DataFrame和DataSet数据类型。对于开发者,在使用Structured Streaming时,不需要去考虑使用的是流式计算还是离线计算,只需要使用相同的方式进行处理。
Structured Streaming支持了连续流模型(Continuous Processing),类似于Flink的实时流,而不是以前的micro batch,但是在使用的时候仍然有限制,大部分情况还是在应用小批量(micro-batch)模式。
Spark 2.2之后,Structured Streaming已经被标注为稳定版,意味着Spark的流式计算已经以Structured Streaming为主。
本类演示:Spark读取本地文件或Kafka数据,写入到HBase中。
package com.zorel.sparktohbase;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.ForeachWriter;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;
import org.apache.spark.sql.streaming.Trigger;
import org.apache.spark.sql.types.StructType;
import org.json.JSONObject;
import static org.apache.spark.sql.types.DataTypes.StringType;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.spark.sql.AnalysisException;
import static org.apache.spark.sql.types.DataTypes.*;
import com.google.gson.JsonObject;
import com.zorel.interchange.crud.CrudHBase;
public class SparkRealize implements Functions{
static private SparkSession ss = null;
static{
System.setProperty("HADOOP_USER_NAME","root");
ss = SparkUtils.getSparkSession("Stream");
}
public void sparktohbase() {
// TODO Auto-generated method stub
//File数据源,必须指定schema
StructType scheam = new StructType().add("event_time", StringType)
.add("flow_id", LongType)
.add("event_unixtime", TimestampType)
.add("in_iface", StringType)
.add("event_type", StringType)
.add("src_ip", StringType)
.add("src_port", IntegerType)
.add("dest_ip", StringType)
.add("dest_port", StringType)
.add("proto", StringType)
.add("tx_id", IntegerType)
.add("alert", new StructType()
.add("action", StringType)
.add("gid", IntegerType)
.add("qr_id", StringType)
.add("signature_id", LongType)
.add("rev", IntegerType)
.add("signature", StringType)
.add("category", StringType)
.add("severity", IntegerType)
.add("tx_id", IntegerType)
.add("cm_id", StringType))
.add("http", new StructType()
.add("hostname", StringType)
.add("url", StringType)
.add("http_user_agent", StringType)
.add("http_content_type", StringType)
.add("http_method", StringType)
.add("protocol", StringType)
.add("status", IntegerType)
.add("length", IntegerType)
.add("http_request_headers", StringType)
.add("http_request_headers_len", IntegerType)
.add("http_response_headers", StringType)
.add("http_response_headers_len", IntegerType)
)
.add("fileinfo", new StructType().add("fstorepath", StringType))
.add("src_country", StringType)
.add("src_city", StringType)
.add("src_ip_val", LongType)
.add("dest_ip_val", LongType)
.add("ip_type", IntegerType)
.add("clusterid", StringType);
//Dataset<org.apache.spark.sql.Row> jsons = ss.readStream().schema(scheam).format("json").load("C:\\java\\file");
Dataset<org.apache.spark.sql.Row> jsons = ss.read().format("json").load("file:///usr/test");
try {
jsons.createOrReplaceGlobalTempView("Temp");
Dataset<org.apache.spark.sql.Row> sql = ss.sql("select * from global_temp.Temp where event_type = 'alert' ");
//sql.show();
//.foreach(new HBaseSink())
//.foreachBatch()
//.format("console")
//.outputMode("append")
//.start();
//foreachQuery.awaitTermination();
List<String> str = sql.toJSON().collectAsList();
List list = new ArrayList<JSONObject>();
str.forEach(elem->list.add(new JSONObject(elem)));
CrudHBase cj = new CrudHBase();
cj.addBatch("eve_http", list);
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
@Override
public void fileSparSqlToConsole() {
// TODO Auto-generated method stub
Dataset<org.apache.spark.sql.Row> jsons = ss.read().format("json").load("C:\\java\\file");
jsons.createOrReplaceGlobalTempView("Temp");
Dataset<org.apache.spark.sql.Row> sql = ss.sql("select * from global_temp.Temp where event_type = 'alert' ");
sql.show();
List<String> str = sql.toJSON().collectAsList();
str.forEach(elem->System.out.println(elem));
ss.close();
}
@Override
public void fileSparkSqlToHbase() {
// TODO Auto-generated method stub
Dataset<org.apache.spark.sql.Row> jsons = ss.read().format("json").load("file:///opt/test");
jsons.createOrReplaceGlobalTempView("Temp");
Dataset<org.apache.spark.sql.Row> sql = ss.sql("select * from global_temp.Temp where event_type = 'alert' ");
List<String> str = sql.toJSON().collectAsList();
List list = new ArrayList<JSONObject>();
str.forEach(elem->list.add(new JSONObject(elem)));
CrudHBase cj = new CrudHBase();
cj.addBatch("eve_http", list);
}
@Override
public void fileSparkSqlToClickhouse() {
// TODO Auto-generated method stub
Dataset<org.apache.spark.sql.Row> jsons = ss.read().format("json").load("C:\\java\\file");
jsons.createOrReplaceGlobalTempView("Temp");
Dataset<org.apache.spark.sql.Row> sql = ss.sql("select * from global_temp.Temp where event_type = 'alert' ");
List<String> str = sql.toJSON().collectAsList();
List list = new ArrayList<JSONObject>();
str.forEach(elem->list.add(new JSONObject(elem)));
//CrudClickhouse cj = new CrudClickhouse();
//cj.addBatch("eve_http", list);
}
@Override
public void fileSparkStreamToConsole() {
// TODO Auto-generated method stub
StructType scheam = new StructType().add("event_time", StringType)
.add("flow_id", LongType)
.add("event_unixtime", TimestampType)
.add("in_iface", StringType)
.add("event_type", StringType)
.add("src_ip", StringType)
.add("src_port", IntegerType)
.add("dest_ip", StringType)
.add("dest_port", StringType)
.add("proto", StringType)
.add("tx_id", IntegerType)
.add("alert", new StructType()
.add("action", StringType)
.add("gid", IntegerType)
.add("qr_id", StringType)
.add("signature_id", LongType)
.add("rev", IntegerType)
.add("signature", StringType)
.add("category", StringType)
.add("severity", IntegerType)
.add("tx_id", IntegerType)
.add("cm_id", StringType))
.add("http", new StructType()
.add("hostname", StringType)
.add("url", StringType)
.add("http_user_agent", StringType)
.add("http_content_type", StringType)
.add("http_method", StringType)
.add("protocol", StringType)
.add("status", IntegerType)
.add("length", IntegerType)
.add("http_request_headers", StringType)
.add("http_request_headers_len", IntegerType)
.add("http_response_headers", StringType)
.add("http_response_headers_len", IntegerType)
)
.add("fileinfo", new StructType().add("fstorepath", StringType))
.add("src_country", StringType)
.add("src_city", StringType)
.add("src_ip_val", LongType)
.add("dest_ip_val", LongType)
.add("ip_type", IntegerType)
.add("clusterid", StringType);
Dataset<org.apache.spark.sql.Row> jsons = ss.readStream().schema(scheam).format("json").load("file:///usr/test");
try {
jsons.createOrReplaceGlobalTempView("Temp");
Dataset<org.apache.spark.sql.Row> sql = ss.sql("select * from global_temp.Temp where event_type = 'alert' ");
sql.writeStream()
//.foreach(new HBaseSink())
.foreachBatch((a, b)->{
List<String> list = a.toJSON().collectAsList();
List arr = new ArrayList<JSONObject>();
list.forEach(elem->arr.add(new JSONObject(elem)));
})
.format("console")
.start()
.awaitTermination();
}catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
@Override
public void fileSparkStreamToHbase() {
// TODO Auto-generated method stub
StructType scheam = new StructType().add("event_time", StringType)
.add("flow_id", LongType)
.add("event_unixtime", TimestampType)
.add("in_iface", StringType)
.add("event_type", StringType)
.add("src_ip", StringType)
.add("src_port", IntegerType)
.add("dest_ip", StringType)
.add("dest_port", StringType)
.add("proto", StringType)
.add("tx_id", IntegerType)
.add("alert", new StructType()
.add("action", StringType)
.add("gid", IntegerType)
.add("qr_id", StringType)
.add("signature_id", LongType)
.add("rev", IntegerType)
.add("signature", StringType)
.add("category", StringType)
.add("severity", IntegerType)
.add("tx_id", IntegerType)
.add("cm_id", StringType))
.add("http", new StructType()
.add("hostname", StringType)
.add("url", StringType)
.add("http_user_agent", StringType)
.add("http_content_type", StringType)
.add("http_method", StringType)
.add("protocol", StringType)
.add("status", IntegerType)
.add("length", IntegerType)
.add("http_request_headers", StringType)
.add("http_request_headers_len", IntegerType)
.add("http_response_headers", StringType)
.add("http_response_headers_len", IntegerType)
)
.add("fileinfo", new StructType().add("fstorepath", StringType))
.add("src_country", StringType)
.add("src_city", StringType)
.add("src_ip_val", LongType)
.add("dest_ip_val", LongType)
.add("ip_type", IntegerType)
.add("clusterid", StringType);
Dataset<org.apache.spark.sql.Row> jsons = ss.readStream().schema(scheam).format("json").load("file:///opt/test");
try {
jsons.createOrReplaceGlobalTempView("Temp");
Dataset<org.apache.spark.sql.Row> sql = ss.sql("select * from global_temp.Temp where event_type = 'alert' ");
sql.writeStream()
//.foreach(new HBaseSink())
.foreachBatch((a, b)->{
List<String> list = a.toJSON().collectAsList();
List arr = new ArrayList<JSONObject>();
list.forEach(elem->arr.add(new JSONObject(elem)));
CrudHBase ch = new CrudHBase();
arr.forEach(elem->System.out.println(elem));
ch.addBatch("eve_http", arr);
})
//.format("console")
.outputMode("append")
.start()
.awaitTermination();
}catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
@Override
public void fileSparkStreamToClickhouse() {
// TODO Auto-generated method stub
StructType scheam = new StructType().add("event_time", StringType)
.add("flow_id", LongType)
.add("event_unixtime", TimestampType)
.add("in_iface", StringType)
.add("event_type", StringType)
.add("src_ip", StringType)
.add("src_port", IntegerType)
.add("dest_ip", StringType)
.add("dest_port", StringType)
.add("proto", StringType)
.add("tx_id", IntegerType)
.add("alert", new StructType()
.add("action", StringType)
.add("gid", IntegerType)
.add("qr_id", StringType)
.add("signature_id", LongType)
.add("rev", IntegerType)
.add("signature", StringType)
.add("category", StringType)
.add("severity", IntegerType)
.add("tx_id", IntegerType)
.add("cm_id", StringType))
.add("http", new StructType()
.add("hostname", StringType)
.add("url", StringType)
.add("http_user_agent", StringType)
.add("http_content_type", StringType)
.add("http_method", StringType)
.add("protocol", StringType)
.add("status", IntegerType)
.add("length", IntegerType)
.add("http_request_headers", StringType)
.add("http_request_headers_len", IntegerType)
.add("http_response_headers", StringType)
.add("http_response_headers_len", IntegerType)
)
.add("fileinfo", new StructType().add("fstorepath", StringType))
.add("src_country", StringType)
.add("src_city", StringType)
.add("src_ip_val", LongType)
.add("dest_ip_val", LongType)
.add("ip_type", IntegerType)
.add("clusterid", StringType);
Dataset<org.apache.spark.sql.Row> jsons = ss.readStream().schema(scheam).format("json").load("C:\\java\\file");
try {
jsons.createOrReplaceGlobalTempView("Temp");
Dataset<org.apache.spark.sql.Row> sql = ss.sql("select * from global_temp.Temp where event_type = 'alert' ");
sql.writeStream()
//.foreach(new HBaseSink())
.foreachBatch((a, b)->{
List<String> list = a.toJSON().collectAsList();
List arr = new ArrayList<JSONObject>();
list.forEach(elem->arr.add(new JSONObject(elem)));
// CrudClickhouse cli = new CrudClickhouse();
//cli.addBatch("", arr);
})
//.format("console")
.outputMode("append")
.start()
.awaitTermination();
}catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
@Override
public void kafkaSparkStreamToClickhouse() {
// TODO Auto-generated method stub
}
@Override
public void kafkaSparkStreamToHbase() {
// TODO Auto-generated method stub
SparkSession session = SparkUtils.getSparkSession("spark");
Dataset<org.apache.spark.sql.Row> ds = session.readStream()
.format("kafka")
.option("kafka.bootstrap.servers", "192.168.10.31:9092")
.option("subscribe", "topic0")
//.option("failOnDataLoss", false)
.option("startingOffsets", "latest")
.load();
Dataset<org.apache.spark.sql.Row> res = ds.selectExpr("CAST(value AS string)");
String str = null;
try {
res.writeStream()
.queryName("query")
.option("checkpointLocation", "file:///usr/checkpoint")
//.option("checkpointLocation", "C://checkpoint")
.foreachBatch((a, b)->{
//a表示存取数据的DataSet,b表示BatchId:Long
List<String> list = a.toJSON().collectAsList();
list.forEach(elem->System.out.println(elem));
List<String> mid = new ArrayList();
list.forEach(elem->{
String s1 = elem.replaceAll("\\\\", "");
String s2 = s1.substring(10);
String s3 = s2.substring(0, s2.length()-1);
mid.add(s3);
});
mid.forEach(elem->System.out.println(elem));
List<JSONObject> result = new ArrayList();
mid.forEach(elem->{
JSONObject json = new JSONObject(elem);
if(json.get("event_type").equals("alert"))
result.add(new JSONObject(elem));
});
//result.forEach(elem->System.out.println(elem));
CrudHBase ch = new CrudHBase();
ch.addBatch("eve_http", result);
})
.outputMode("append")
//.format("console")
.start()
.awaitTermination();
} catch (StreamingQueryException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
@Override
public void kafkaSparkStreamToConsole() {
// TODO Auto-generated method stub
SparkSession session = SparkUtils.getSparkSession("spark");
Dataset<org.apache.spark.sql.Row> ds = session.readStream()
.format("kafka")
.option("kafka.bootstrap.servers", "192.168.10.31:9092")
.option("subscribe", "topic0")
//.option("failOnDataLoss", false)
.option("startingOffsets", "latest")
.load();
Dataset<org.apache.spark.sql.Row> res = ds.selectExpr("CAST(value AS string)");
try {
res.writeStream()
.queryName("query")
.option("checkpointLocation", "D://checkpoint")
.outputMode("append")
.format("console")
.start()
.awaitTermination();
} catch (StreamingQueryException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}