1. Import the jar packages
The jars under the lib directory of the Scala installation, plus the Spark jars (the code below also needs spark-core and spark-streaming on the classpath).
2. Write the simple word-count code
import org.apache.spark.*;
import org.apache.spark.api.java.function.*;
import org.apache.spark.streaming.*;
import org.apache.spark.streaming.api.java.*;
import scala.Tuple2;
import java.util.Arrays;
public class sparkStreamingUnit {
    public static void main(String[] args) throws Exception {
        System.out.println("Spark Streaming Unit Test!");
        // For local testing, use local[2] (two worker threads); here the job runs on YARN
        //SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("SparkStreamingWordCountDemo");
        SparkConf conf = new SparkConf().setMaster("yarn").setAppName("SparkStreamingWordCountDemo");
        // StreamingContext with a batch interval of 5 seconds
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
        jssc.sparkContext().setLogLevel("ERROR");
        // Create a DStream that connects to hostname:port, here 192.168.32.132:9999
        JavaReceiverInputDStream<String> lines = jssc.socketTextStream("192.168.32.132", 9999);
        // Split each line into words
        JavaDStream<String> words = lines.flatMap(x -> Arrays.asList(x.split(" ")).iterator());
        // Count each word in each batch
        JavaPairDStream<String, Integer> pairs = words.mapToPair(s -> new Tuple2<>(s, 1));
        JavaPairDStream<String, Integer> wordCounts = pairs.reduceByKey((i1, i2) -> i1 + i2);
        // Print the first ten elements of each RDD generated in this DStream to the console
        wordCounts.print();
        jssc.start();              // Start the computation
        jssc.awaitTermination();   // Wait for the computation to terminate
        jssc.stop();
    }
}
3. Write a spark-submit shell script and upload it to the Spark cluster
4. Run the Spark Streaming job and the netcat program (a sketch of both follows)
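A minimal sketch of the submit script and the netcat driver, assuming the application is packaged as sparkStreamingUnit.jar (a hypothetical name) and submitted in YARN client mode; the resource sizes are assumptions, not from the original:

#!/bin/bash
# submit.sh -- jar name and resource sizes are placeholders
spark-submit \
  --class sparkStreamingUnit \
  --master yarn \
  --deploy-mode client \
  --num-executors 2 \
  --executor-memory 1g \
  sparkStreamingUnit.jar

# In a separate terminal on 192.168.32.132, start netcat on port 9999
# before the job launches, since the receiver connects to it:
nc -lk 9999

Words typed into the netcat session should appear as per-batch counts on the driver console every 5 seconds.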
// CBO (cost-based optimizer) settings; CBO relies on table/column statistics collected via ANALYZE TABLE
sparkConf.set("spark.sql.cbo.enabled","true")
sparkConf.set("spark.sql.cbo.joinReorder.enabled","true")
sparkConf.set("spark.sql.statistics.histogram.enabled","true")
// Adaptive query execution (Spark 2.4+)
sparkConf.set("spark.sql.adaptive.enabled","true")
// Enable shuffle file consolidation (only effective with the legacy hash shuffle manager, which was removed in Spark 2.0)
sparkConf.set("spark.shuffle.consolidateFiles","true")
// Default parallelism
sparkConf.set("spark.default.parallelism","150")
// Data locality wait time
sparkConf.set("spark.locality.wait","6s")
// Map-side shuffle write buffer
sparkConf.set("spark.shuffle.file.buffer","64k")
// Partition-count threshold below which the bypass merge-sort shuffle is used
sparkConf.set("spark.shuffle.sort.bypassMergeThreshold","1000")
// Reduce-side fetch buffer (max data in flight per reduce task)
sparkConf.set("spark.reducer.maxSizeInFlight","48m")
// Shuffle I/O retry count
sparkConf.set("spark.shuffle.io.maxRetries","10")
// Wait between shuffle I/O retries
sparkConf.set("spark.shuffle.io.retryWait","10s")
// Fraction of memory for reduce-side aggregation (legacy option, deprecated since the unified memory manager in Spark 1.6)
sparkConf.set("spark.shuffle.memoryFraction","0.5")
// Use Kryo serialization
sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
// Automatic repartitioning (does not appear in the standard Apache Spark configuration; likely vendor-specific)
sparkConf.set("spark.sql.auto.repartition","true")
// Number of partitions used for SQL shuffles
sparkConf.set("spark.sql.shuffle.partitions","500")
// Compress the in-memory columnar cache (codec chosen automatically)
sparkConf.set("spark.sql.inMemoryColumnarStorage.compressed","true")
// Disable automatic inference of partition column types
sparkConf.set("spark.sql.source.partitionColumnTypeInference.enabled","false")
// Let Tungsten manage memory
sparkConf.set("spark.sql.tungsten.enabled","true")
// Allow sorts to spill to disk (external sort)
sparkConf.set("spark.sql.planner.externalSort","true")
// Increase the executor heartbeat interval
sparkConf.set("spark.executor.heartbeatInterval","60s")
// Idle timeout (seconds) before dynamic allocation removes an executor holding cached data
sparkConf.set("spark.dynamicAllocation.cachedExecutorIdleTimeout","120")
// Broadcast join threshold: tables smaller than 100 MB are broadcast
sparkConf.set("spark.sql.autoBroadcastJoinThreshold","104857600")
// Other settings: max bytes per file partition (256 MB), estimated cost of opening a file (8 MB), max fields printed in debug strings
sparkConf.set("spark.sql.files.maxPartitionBytes","268435456")
sparkConf.set("spark.sql.files.openCostInBytes","8388608")
sparkConf.set("spark.debug.maxToStringFields","500")
// Speculative execution: check every 500 ms once 80% of tasks finish, and re-launch tasks 1.5x slower than the median
sparkConf.set("spark.speculation","true")
sparkConf.set("spark.speculation.interval","500")
sparkConf.set("spark.speculation.quantile","0.8")
sparkConf.set("spark.speculation.multiplier","1.5")