<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>test</artifactId>
<groupId>org.example</groupId>
<version>1.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>spark</artifactId>
<properties>
<!-- These properties are simply variables that are referenced below -->
<spark.version>2.3.3</spark.version>
<scala.version>2.11.12</scala.version>
<spark.scala>2.11</spark.scala>
</properties>
<dependencies>
<!-- Scala standard library -->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<!-- spark-sql transitively pulls in all the Spark core packages -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${spark.scala}</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- Spark and Hive integration package -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_${spark.scala}</artifactId>
<version>${spark.version}</version>
</dependency>
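<!-- Assumed addition: the original POM does not declare the MySQL JDBC driver, but the
DriverManager.getConnection("jdbc:mysql://...") call in the code below needs it on the
classpath; the version here is only a guess, pick one that matches your MySQL server -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.47</version>
</dependency>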
</dependencies>
<build>
<plugins>
<!-- Plugin that compiles the Scala sources -->
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.3.1</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
<configuration>
<args>
<arg>-dependencyfile</arg>
<arg>${project.build.directory}/.scala_dependencies</arg>
</args>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
package com.ws
import java.sql.{DriverManager, ResultSet}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.JdbcRDD
object SparkReadMysql {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
.setAppName("Spark reads MySQL data")
.setMaster("local")
val sc = new SparkContext(conf)
// Factory function that opens a new JDBC connection; it is shipped to the executors and invoked there
val getConn = () => DriverManager.getConnection(
"jdbc:mysql://localhost:3306/dynamic_rule", "root", "root")
// Maps one ResultSet row to a (menu_id, name, url, perms) tuple
val resMapping = (rs: ResultSet) => {
val id = rs.getInt(1)
val name = rs.getString(2)
val url = rs.getString(3)
val perms = rs.getString(4)
(id, name, url, perms)
}
/*
new JdbcRDD(sc, getConnection, sql, lowerBound, upperBound, numPartitions, resultMapFunc)
sc:            the SparkContext
getConnection: a zero-argument function that returns a JDBC connection
sql:           the SQL statement; because rows are fetched per partition, it must contain a
               split key bounded by two placeholders (e.g. menu_id >= ? and menu_id <= ?),
               and that column must be an integral type
lowerBound:    lower bound of the split key (inclusive)
upperBound:    upper bound of the split key (inclusive)
numPartitions: number of partitions
resultMapFunc: function that maps each ResultSet row to a value
*/
val rdd: JdbcRDD[(Int, String, String, String)] = new JdbcRDD[(Int, String, String, String)](sc,
getConn,
"select menu_id,name,url,perms from sys_menu where menu_id> ? and menu_id<?",
0, 10, 1,
resMapping)
rdd.foreach(println)
sc.stop()
}
}
Here we create a JdbcRDD. JdbcRDD is a subclass of RDD, and its constructor takes seven arguments (the SparkContext, a connection factory, the SQL with two range placeholders, the lower and upper bounds of the split key, the number of partitions, and a ResultSet mapping function), which makes it fairly inflexible.
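Because of those limitations (the split key must be an integral column and the SQL has to carry the two range placeholders), the DataFrame JDBC source that ships with spark-sql is usually the more flexible way to read MySQL. Below is a minimal sketch against the same sys_menu table; the connection URL, user and password are copied from the example above and are assumptions to adjust for your own environment.

package com.ws
import org.apache.spark.sql.SparkSession

object SparkSqlReadMysql {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("Spark SQL reads MySQL data")
      .master("local")
      .getOrCreate()

    // The JDBC data source infers the schema itself, so no hand-written
    // ResultSet mapping function or range placeholders are needed in the SQL.
    val df = spark.read
      .format("jdbc")
      .option("url", "jdbc:mysql://localhost:3306/dynamic_rule")
      .option("dbtable", "sys_menu")
      .option("user", "root")
      .option("password", "root")
      .load()

    df.select("menu_id", "name", "url", "perms").show()
    spark.stop()
  }
}

For a large table, the options partitionColumn, lowerBound, upperBound and numPartitions can be added to parallelize the read; they play the same role as the bounds and partition count passed to JdbcRDD above.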