概述
Structured Streaming是基于Spark SQL的流式处理框架,支持DataFrame和DataSet数据类型。对于开发者,在使用Structured Streaming时,不需要去考虑使用的是流式计算还是离线计算,只需要使用相同的方式进行处理。
Structured Streaming支持了连续流模型(Continuous Processing),类似于Flink的实时流,而不是以前的micro batch,但是在使用的时候仍然有限制,大部分情况还是在应用小批量(micro-batch)模式。
Spark 2.2之后,Structured Streaming已经被标注为稳定版,意味着Spark的流式计算已经以Structured Streaming为主。
本类演示:Spark读取本地文件或Kafka数据,写入到HBase中。
package com.zorel.sparktohbase;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.ForeachWriter;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;
import org.apache.spark.sql.streaming.Trigger;
import org.apache.spark.sql.types.StructType;
import org.json.JSONObject;
import static org.apache.spark.sql.types.DataTypes.StringType;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.spark.sql.AnalysisException;
import static org.apache.spark.sql.types.DataTypes.*;
import com.google.gson.JsonObject;
import com.zorel.interchange.crud.CrudHBase;
public class SparkRealize implements Functions{
static private SparkSession ss = null;
static{
System.setProperty("HADOOP_USER_NAME","root");
ss = SparkUtils.getSparkSession("Stream");
}
public void sparktohbase() {
// TODO Auto-generated method stub
//File数据源,必须指定schema
StructType scheam = new StructType().add("event_time", StringType)
.add("flow_id", LongType)
.add("event_unixtime", TimestampType)
.add("in_iface", StringType)
.add("event_type", StringType)
.add("src_ip", StringType)
.add("src_port", IntegerType)
.add("dest_ip", StringType)
.add("dest_port", StringType)
.add("proto", StringType)
.add("tx_id", IntegerType)
.add("alert", new StructType()
.add("action", StringType)
.add("gid", IntegerType)
.add("qr_id", StringType)
.add("signature_id", LongType)
.add("rev", IntegerType)
.add("signature", StringType)
.add("category", StringType)
.add("severity", IntegerType)
.add("tx_id", IntegerType)
.add("cm_id", StringType))
.add("http", new StructType()
.add("hostname", StringType)
.add("url", StringType)
.add("http_user_agent", StringType)
.add("http_content_type", StringType)
.add("http_method", StringType)
.add("protocol", StringType)
.add("status", IntegerType)
.add("length", IntegerType)
.add("http_request_headers", StringType)
.add("http_request_headers_len", IntegerType)
.add("http_response_headers", StringType)
.add("http_response_headers_len", IntegerType)
)
.add("fileinfo", new StructType().add("fstorepath", StringType))
.add("src_country", StringType)
.add("src_city", StringType)
.add("src_ip_val", LongType)
.add("dest_ip_val", LongType)
.add("ip_type", IntegerType)
.add("clusterid", StringType);
//Dataset<org.apache.spark.sql.Row> jsons = ss.readStream().schema(scheam).format("json").load("C:\\java\\file");
Dataset<org.apache.spark.sql.Row> jsons = ss.read().format("json").load("file:///usr/test");
try {
jsons.createOrReplaceGlobalTempView("Temp");
Dataset<org.apache.spark.sql.Row> sql = ss.sql("select * from global_temp.Temp where event_type = 'alert' ");
//sql.show();
//.foreach(new HBaseSink())
//.foreachBatch()
//.format("console")
//.outputMode("append")
//.start();
//foreachQuery.awaitTermination();
List<String> str = sql.toJSON().collectAsList();
List list = new ArrayList<JSONObject>();
str.forEach(elem->list.add(new JSONObject(elem)));
CrudHBase cj = new CrudHBase();
cj.addBatch("eve_http", list);
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
@Override
public void fileSparSqlToConsole() {
// TODO Auto-generated method stub
Dataset<org.apache.spark.sql.Row> jsons = ss.read().format("json").load("C:\\java\\file");
jsons.createOrReplaceGlobalTempView("Temp");
Dataset<org.apache.spark.sql.Row> sql = ss.sql("select * from global_temp.Temp where event_type = 'alert' ");
sql.show();
List<String> str = sql.toJSON().collectAsList();
str.forEach(elem->System.out.println(elem));
ss.close();
}
@Override
public void fileSparkSqlToHbase() {
// TODO Auto-generated method stub
Dataset<org.apache.spark.sql.Row> jsons = ss.read().format("json").load("file:///opt/test");
jsons.createOrReplaceGlobalTempView("Temp");
Dataset<org.apache.spark.sql.Row> sql = ss.sql("select * from global_temp.Temp where event_type = 'alert' ");
List<String> str = sql.toJSON().collectAsList();
List list = new ArrayList<JSONObject>();
str.forEach(elem->list.add(new JSONObject(elem)));
CrudHBase cj = new CrudHBase();
cj.addBatch("eve_http", list);
}
@Override
public void fileSparkSqlToClickhouse() {
// TODO Auto-generated method stub
Dataset<org.apache.spark.sql.Row> jsons = ss.read().format("json").load("C:\\java\\file");
jsons.createOrReplaceGlobalTempView("Temp");
Dataset<org.apache.spark.sql.Row> sql = ss.sql("select * from global_temp.Temp where event_type = 'alert' ");
List<String> str = sql.toJSON().collectAsList();
List list = new ArrayList<JSONObject>();
str.forEach(elem->list.add(new JSONObject(elem)));
//CrudClickhouse cj = new CrudClickhouse();
//cj.addBatch("eve_http", list);
}
@Override
public void fileSparkStreamToConsole() {
// TODO Auto-generated method stub
StructType scheam = new StructType().add("event_time", StringType)
.add("flow_id", LongType)
.add("event_unixtime", TimestampType)
.add("in_iface", StringType)
.add("event_type", StringType)
.add("src_ip", StringType)
.add("src_port", IntegerType)
.add("dest_ip", StringType)
.add("dest_port", StringType)
.add("proto", StringType)
.add("tx_id", IntegerType)
.add("alert", new StructType()
.add("action", StringType)
.add("gid", IntegerType)
.add("qr_id", StringType)
.add("signature_id", LongType)
.add("rev", IntegerType)
.add("signature", StringType)
.add("category", StringType)
.add("severity", IntegerType)
.add("tx_id", IntegerType)
.add("cm_id", StringType))
.add("http", new StructType()
.add("hostname", StringType)
.add("url", StringType)
.add("http_user_agent", StringType)
.add("http_content_type", StringType)
.add("http_method", StringType)
.add("protocol", StringType)
.add("status", IntegerType)
.add("length", IntegerType)
.add("http_request_headers", StringType)
.add("http_request_headers_len", IntegerType)
.add("http_response_headers", StringType)
.add("http_response_headers_len", IntegerType)
)
.add("fileinfo", new StructType().add("fstorepath", StringType))
.add("src_country", StringType)
.add("src_city", StringType)
.add("src_ip_val", LongType)
.add("dest_ip_val", LongType)
.add("ip_type", IntegerType)
.add("clusterid", StringType);
Dataset<org.apache.spark.sql.Row> jsons = ss.readStream().schema(scheam).format("json").load("file:///usr/test");
try {
jsons.createOrReplaceGlobalTempView("Temp");
Dataset<org.apache.spark.sql.Row> sql = ss.sql("select * from global_temp.Temp where event_type = 'alert' ");
sql.writeStream()
//.foreach(new HBaseSink())
.foreachBatch((a, b)->{
List<String> list = a.toJSON().collectAsList();
List arr = new ArrayList<JSONObject>();
list.forEach(elem->arr.add(new JSONObject(elem)));
})
.format("console")
.start()
.awaitTermination();
}catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
@Override
public void fileSparkStreamToHbase() {
// TODO Auto-generated method stub
StructType scheam = new StructType().add("event_time", StringType)
.add("flow_id", LongType)
.add("event_unixtime", TimestampType)
.add("in_iface", StringType)
.add("event_type", StringType)
.add("src_ip", StringType)
.add("src_port", IntegerType)
.add("dest_ip", StringType)
.add("dest_port", StringType)
.add("proto", StringType)
.add("tx_id", IntegerType)
.add("alert", new StructType()
.add("action", StringType)
.add("gid", IntegerType)
.add("qr_id", StringType)
.add("signature_id", LongType)
.add("rev", IntegerType)
.add("signature", StringType)
.add("category", StringType)
.add("severity", IntegerType)
.add("tx_id", IntegerType)
.add("cm_id", StringType))
.add("http", new StructType()
.add("hostname", StringType)
.add("url", StringType)
.add("http_user_agent", StringType)
.add("http_content_type", StringType)
.add("http_method", StringType)
.add("protocol", StringType)
.add("status", IntegerType)
.add("length", IntegerType)
.add("http_request_headers", StringType)
.add("http_request_headers_len", IntegerType)
.add("http_response_headers", StringType)
.add("http_response_headers_len", IntegerType)
)
.add("fileinfo", new StructType().add("fstorepath", StringType))
.add("src_country", StringType)
.add("src_city", StringType)
.add("src_ip_val", LongType)
.add("dest_ip_val", LongType)
.add("ip_type", IntegerType)
.add("clusterid", StringType);
Dataset<org.apache.spark.sql.Row> jsons = ss.readStream().schema(scheam).format("json").load("file:///opt/test");
try {
jsons.createOrReplaceGlobalTempView("Temp");
Dataset<org.apache.spark.sql.Row> sql = ss.sql("select * from global_temp.Temp where event_type = 'alert' ");
sql.writeStream()
//.foreach(new HBaseSink())
.foreachBatch((a, b)->{
List<String> list = a.toJSON().collectAsList();
List arr = new ArrayList<JSONObject>();
list.forEach(elem->arr.add(new JSONObject(elem)));
CrudHBase ch = new CrudHBase();
arr.forEach(elem->System.out.println(elem));
ch.addBatch("eve_http", arr);
})
//.format("console")
.outputMode("append")
.start()
.awaitTermination();
}catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
@Override
public void fileSparkStreamToClickhouse() {
// TODO Auto-generated method stub
StructType scheam = new StructType().add("event_time", StringType)
.add("flow_id", LongType)
.add("event_unixtime", TimestampType)
.add("in_iface", StringType)
.add("event_type", StringType)
.add("src_ip", StringType)
.add("src_port", IntegerType)
.add("dest_ip", StringType)
.add("dest_port", StringType)
.add("proto", StringType)
.add("tx_id", IntegerType)
.add("alert", new StructType()
.add("action", StringType)
.add("gid", IntegerType)
.add("qr_id", StringType)
.add("signature_id", LongType)
.add("rev", IntegerType)
.add("signature", StringType)
.add("category", StringType)
.add("severity", IntegerType)
.add("tx_id", IntegerType)
.add("cm_id", StringType))
.add("http", new StructType()
.add("hostname", StringType)
.add("url", StringType)
.add("http_user_agent", StringType)
.add("http_content_type", StringType)
.add("http_method", StringType)
.add("protocol", StringType)
.add("status", IntegerType)
.add("length", IntegerType)
.add("http_request_headers", StringType)
.add("http_request_headers_len", IntegerType)
.add("http_response_headers", StringType)
.add("http_response_headers_len", IntegerType)
)
.add("fileinfo", new StructType().add("fstorepath", StringType))
.add("src_country", StringType)
.add("src_city", StringType)
.add("src_ip_val", LongType)
.add("dest_ip_val", LongType)
.add("ip_type", IntegerType)
.add("clusterid", StringType);
Dataset<org.apache.spark.sql.Row> jsons = ss.readStream().schema(scheam).format("json").load("C:\\java\\file");
try {
jsons.createOrReplaceGlobalTempView("Temp");
Dataset<org.apache.spark.sql.Row> sql = ss.sql("select * from global_temp.Temp where event_type = 'alert' ");
sql.writeStream()
//.foreach(new HBaseSink())
.foreachBatch((a, b)->{
List<String> list = a.toJSON().collectAsList();
List arr = new ArrayList<JSONObject>();
list.forEach(elem->arr.add(new JSONObject(elem)));
// CrudClickhouse cli = new CrudClickhouse();
//cli.addBatch("", arr);
})
//.format("console")
.outputMode("append")
.start()
.awaitTermination();
}catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
@Override
public void kafkaSparkStreamToClickhouse() {
// TODO Auto-generated method stub
}
@Override
public void kafkaSparkStreamToHbase() {
// TODO Auto-generated method stub
SparkSession session = SparkUtils.getSparkSession("spark");
Dataset<org.apache.spark.sql.Row> ds = session.readStream()
.format("kafka")
.option("kafka.bootstrap.servers", "192.168.10.31:9092")
.option("subscribe", "topic0")
//.option("failOnDataLoss", false)
.option("startingOffsets", "latest")
.load();
Dataset<org.apache.spark.sql.Row> res = ds.selectExpr("CAST(value AS string)");
String str = null;
try {
res.writeStream()
.queryName("query")
.option("checkpointLocation", "file:///usr/checkpoint")
//.option("checkpointLocation", "C://checkpoint")
.foreachBatch((a, b)->{
//a表示存取数据的DataSet,b表示BatchId:Long
List<String> list = a.toJSON().collectAsList();
list.forEach(elem->System.out.println(elem));
List<String> mid = new ArrayList();
list.forEach(elem->{
String s1 = elem.replaceAll("\\\\", "");
String s2 = s1.substring(10);
String s3 = s2.substring(0, s2.length()-1);
mid.add(s3);
});
mid.forEach(elem->System.out.println(elem));
List<JSONObject> result = new ArrayList();
mid.forEach(elem->{
JSONObject json = new JSONObject(elem);
if(json.get("event_type").equals("alert"))
result.add(new JSONObject(elem));
});
//result.forEach(elem->System.out.println(elem));
CrudHBase ch = new CrudHBase();
ch.addBatch("eve_http", result);
})
.outputMode("append")
//.format("console")
.start()
.awaitTermination();
} catch (StreamingQueryException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
@Override
public void kafkaSparkStreamToConsole() {
// TODO Auto-generated method stub
SparkSession session = SparkUtils.getSparkSession("spark");
Dataset<org.apache.spark.sql.Row> ds = session.readStream()
.format("kafka")
.option("kafka.bootstrap.servers", "192.168.10.31:9092")
.option("subscribe", "topic0")
//.option("failOnDataLoss", false)
.option("startingOffsets", "latest")
.load();
Dataset<org.apache.spark.sql.Row> res = ds.selectExpr("CAST(value AS string)");
try {
res.writeStream()
.queryName("query")
.option("checkpointLocation", "D://checkpoint")
.outputMode("append")
.format("console")
.start()
.awaitTermination();
} catch (StreamingQueryException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}