1: Versions
Scala: 2.11
Spark: 2.2.3
2: Differences between versions: Spark 2.x replaces the Spark 1.x DataFrame with the Dataset API (in Java, a DataFrame is now simply Dataset<Row>).
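For illustration, a minimal sketch of reading data through the 2.x API; the local master and the people.json sample path are assumptions:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class DatasetDemo {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("DatasetDemo")
                .master("local[2]")
                .getOrCreate();
        // In Spark 1.x this call returned a DataFrame; in 2.x it returns Dataset<Row>
        Dataset<Row> df = spark.read().json("people.json"); // hypothetical sample file
        df.printSchema();
        df.show();
        spark.stop();
    }
}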
3: Project setup
Create a project with Spring Initializr.
Add the following dependencies to pom.xml:
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.2.3</version>
    <exclusions>
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
        </exclusion>
        <exclusion>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
        </exclusion>
    </exclusions>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>2.2.3</version>
</dependency>
<dependency>
    <groupId>org.quartz-scheduler</groupId>
    <artifactId>quartz</artifactId>
    <version>2.3.0</version>
</dependency>
<dependency>
    <groupId>org.springframework</groupId>
    <artifactId>spring-context-support</artifactId>
    <version>5.0.7.RELEASE</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-hive -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-hive_2.11</artifactId>
    <version>2.2.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.scala-lang/scala-library -->
<dependency>
    <groupId>org.scala-lang</groupId>
    <artifactId>scala-library</artifactId>
    <version>2.11.6</version>
</dependency>
<!-- https://mvnrepository.com/artifact/log4j/log4j -->
<dependency>
    <groupId>log4j</groupId>
    <artifactId>log4j</artifactId>
    <version>1.2.17</version>
</dependency>
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-jdbc</artifactId>
</dependency>
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-webflux</artifactId>
    <exclusions>
        <exclusion>
            <groupId>org.codehaus.janino</groupId>
            <artifactId>janino</artifactId>
        </exclusion>
        <exclusion>
            <groupId>org.codehaus.janino</groupId>
            <artifactId>commons-compiler</artifactId>
        </exclusion>
    </exclusions>
</dependency>
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-devtools</artifactId>
    <optional>true</optional>
</dependency>
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
</dependency>
Note: a Spring Boot project ships with some integrations by default, so when integrating Spark the conflicting transitive dependencies must be excluded (see the exclusions above); otherwise the application fails to start.
4: Configuring Spark SQL
import com.alibaba.fastjson.JSONObject; // assuming fastjson's JSONObject
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

@Configuration
public class SparkContextConf {

    private String appName = "yourAppName";
    private String master = "local[4]"; // deploy in local mode

    // Elasticsearch cluster settings (placeholders; fill in for your environment)
    private String clusterHost = "yourEsHost";
    private String clusterName = "yourEsUsername"; // username for HTTP auth
    private String clusterPwd = "yourEsPassword";

    // Spark configuration
    @Bean
    public SparkConf sparkConf() {
        SparkConf conf = new SparkConf().setAppName(appName).setMaster(master);
        // number of partitions used when shuffling data for SQL joins/aggregations
        conf.set("spark.sql.shuffle.partitions", "4");
        // serialization mechanism
        conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
        // executor memory
        conf.set("spark.executor.memory", "1g");
        // driver memory (in local/client mode this must be set before the JVM starts, e.g. via spark-submit)
        conf.set("spark.driver.memory", "4g");
        conf.set("spark.kryoserializer.buffer.max", "1024m");
        // network timeout
        conf.set("spark.network.timeout", "3600s");
        conf.set("spark.debug.maxToStringFields", "100000");
        // legacy hash-shuffle option from Spark 1.x; a no-op on Spark 2.x
        conf.set("spark.shuffle.consolidateFiles", "true");
        // driver JVM options (reading ~90 million rows into memory at once takes about 9 GB; tune to your data.
        // The PermSize flags apply to Java 7 and earlier only)
        conf.set("spark.driver.extraJavaOptions", "-XX:PermSize=16g -XX:MaxPermSize=16g");
        // Spark-Elasticsearch cluster integration (requires the elasticsearch-spark connector on the classpath)
        conf.set("es.nodes", clusterHost); // cluster host(s)
        conf.set("es.net.http.auth.user", clusterName); // cluster username
        conf.set("es.net.http.auth.pass", clusterPwd); // cluster password
        conf.set("mapred.map.tasks.speculative.execution", "false"); // disable speculative execution for the map phase
        conf.set("mapred.reduce.tasks.speculative.execution", "false"); // disable speculative execution for the reduce phase
        conf.set("es.nodes.wan.only", "true"); // talk only to the declared addresses instead of discovering node IPs (needed when the cluster is reached via a domain name)
        conf.set("es.input.json", "yes"); // input documents are already JSON
        conf.set("es.port", "9200"); // ES HTTP port
        conf.set("spark.broadcast.compress", "true"); // compress broadcast variables
        conf.set("spark.rdd.compress", "true"); // compress serialized RDD partitions
        conf.set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec");
        // buffer size for the shuffle write output streams (interpreted as KiB when no unit is given)
        conf.set("spark.shuffle.file.buffer", "1024");
        // spark.reducer.maxSizeInFlight sets the buffer for shuffle read tasks, which determines how much
        // data each fetch pulls. Tuning advice: with ample memory, increase it (e.g. to 96m) to reduce the
        // number of fetches and hence network round trips; in practice this yields roughly a 1%-5% gain.
        conf.set("spark.reducer.maxSizeInFlight", "4096m");
        // pre-Spark-1.4 name of the same setting (the original key "maxMblnFlight" was a typo); redundant on 2.x
        conf.set("spark.reducer.maxMbInFlight", "4096m");
        // Kryo: a fast, efficient serialization framework; output is smaller and faster than Java serialization
        conf.registerKryoClasses(new Class[]{JSONObject.class, java.util.List.class, java.util.Map.class,
                java.util.ArrayList.class, java.util.HashMap.class});
        return conf;
    }

    @Bean
    public JavaSparkContext javaSparkContext() {
        return new JavaSparkContext(sparkConf());
    }

    @Bean
    public SQLContext sqlContext() {
        return new SQLContext(javaSparkContext());
    }
}
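Section 5 below resolves a SparkSession bean from the ApplicationContext, but the configuration above only defines a SQLContext. A minimal sketch of the missing bean, to be added inside SparkContextConf (requires import org.apache.spark.sql.SparkSession):

@Bean
public SparkSession sparkSession() {
    // build the session on top of the same SparkConf; getOrCreate() reuses the
    // SparkContext started by the javaSparkContext() bean if one already exists
    return SparkSession.builder()
            .config(sparkConf())
            // .enableHiveSupport() // optional, backed by the spark-hive dependency in the pom
            .getOrCreate();
}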
5: Using Spring to inject the Spark beans into business code (the demo below looks up the SparkSession from the ApplicationContext)
import java.util.Properties;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions;
import org.apache.spark.storage.StorageLevel;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.BeansException;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import org.springframework.stereotype.Component;

@Component
public class ETLServiceDemo implements ApplicationContextAware {

    private static ApplicationContext context = null;
    private Logger log = LoggerFactory.getLogger(ETLServiceDemo.class);
    // project-local helper that loads environment properties
    private Properties conf = BaseEnviromentConf.singletonEnvironment();

    private ETLServiceDemo() {
    }

    @Override
    public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
        context = applicationContext;
    }

    public void etl() throws Exception {
        // connection properties for the source table
        Properties prod = new Properties();
        prod.put("user", conf.getProperty("from_mysql_username").trim());
        prod.put("password", SecurityTool.decodeAES("rules", conf.getProperty("from_mysql_password").trim()));
        prod.put("driver", conf.getProperty("mysql_driver").trim());
        // connection properties for the target table
        Properties local = new Properties();
        local.put("user", conf.getProperty("to_mysql_username").trim());
        local.put("password", SecurityTool.decodeAES("rules", conf.getProperty("to_mysql_password").trim()));
        local.put("driver", conf.getProperty("mysql_driver").trim());
        try {
            // fetch the SparkSession bean
            SparkSession session = context.getBean(SparkSession.class);
            writeLive(prod, local, session);
        } catch (Exception e) {
            log.error(e.getMessage(), e);
        }
    }

    public void writeLive(Properties prod, Properties local, SparkSession session) {
        long start = System.currentTimeMillis();
        // Read the your_table_name table from the database configured in prod, then cache it
        // and register it as a temporary view. With MEMORY_ONLY_SER the data is stored
        // serialized in memory; partitions that don't fit are recomputed rather than spilled.
        Dataset<Row> d1 = session.read()
                .option(JDBCOptions.JDBC_BATCH_FETCH_SIZE(), 1000000)
                .jdbc(conf.getProperty("from_mysql_url"), "your_table_name", prod)
                .selectExpr("column1", "column2", "column3", "column4")
                .persist(StorageLevel.MEMORY_ONLY_SER());
        d1.createOrReplaceTempView("l1");
        // query the temporary view
        Dataset<Row> d2 = session.sql("your sql expr");
        // write the processed data back to MySQL; local holds the target database settings
        d2.write()
                .mode(SaveMode.Append)
                .option(JDBCOptions.JDBC_BATCH_INSERT_SIZE(), 1000000)
                .jdbc(conf.getProperty("to_mysql_url"), "your_table_name", local);
        long end = System.currentTimeMillis();
        log.info("elapsed seconds --->>>> " + (end - start) / 1000L);
        // release the cached data once the job is done
        d1.unpersist(true);
        d2.unpersist(true);
    }
}
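The quartz and spring-context-support dependencies in the pom suggest the ETL job is meant to run on a schedule. A minimal sketch of wiring etl() to a Quartz trigger through Spring's scheduling support; the bean names and the cron expression are assumptions:

import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.quartz.CronTriggerFactoryBean;
import org.springframework.scheduling.quartz.MethodInvokingJobDetailFactoryBean;
import org.springframework.scheduling.quartz.SchedulerFactoryBean;

@Configuration
public class EtlScheduleConf {

    // wrap ETLServiceDemo.etl() in a Quartz JobDetail
    @Bean
    public MethodInvokingJobDetailFactoryBean etlJobDetail(ETLServiceDemo etlService) {
        MethodInvokingJobDetailFactoryBean jobDetail = new MethodInvokingJobDetailFactoryBean();
        jobDetail.setTargetObject(etlService);
        jobDetail.setTargetMethod("etl");
        jobDetail.setConcurrent(false); // don't let runs of the same ETL overlap
        return jobDetail;
    }

    // fire the job every day at 02:00 (hypothetical schedule)
    @Bean
    public CronTriggerFactoryBean etlTrigger(MethodInvokingJobDetailFactoryBean etlJobDetail) {
        CronTriggerFactoryBean trigger = new CronTriggerFactoryBean();
        trigger.setJobDetail(etlJobDetail.getObject());
        trigger.setCronExpression("0 0 2 * * ?");
        return trigger;
    }

    @Bean
    public SchedulerFactoryBean schedulerFactoryBean(CronTriggerFactoryBean etlTrigger) {
        SchedulerFactoryBean scheduler = new SchedulerFactoryBean();
        scheduler.setTriggers(etlTrigger.getObject());
        return scheduler;
    }
}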