RDD Data Sources - ★★
- The diagram below shows the various data sources an RDD can work with. The raw API is awkward to use and is usually wrapped; SparkSQL simplifies it a lot.
- So here we only demonstrate using an RDD with JDBC (practice for the RDD API; doing JDBC through SparkSQL later is much simpler, see the sketch at the end of this section).
- Note: add the MySQL driver dependency
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.38</version>
</dependency>
- Code implementation
package cn.hanjiaxiaozhi.core
import java.sql.{Connection, DriverManager, PreparedStatement, ResultSet}
import org.apache.spark.rdd.{JdbcRDD, RDD}
import org.apache.spark.{SparkConf, SparkContext}
/**
* Author hanjiaxiaozhi
* Date 2020/7/23 11:16
* Desc Demonstrates reading from and writing to MySQL via JDBC with Spark
CREATE TABLE `t_student` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`name` varchar(255) DEFAULT NULL,
`age` int(11) DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
*/
object JDBCDataSourceTest {
def main(args: Array[String]): Unit = {
//0. Create the SparkContext (the Spark execution environment)
val conf: SparkConf = new SparkConf().setAppName("JDBCDataSourceTest").setMaster("local[*]") //local[*] runs with local threads, but the execution model is the same as on a cluster
val sc: SparkContext = new SparkContext(conf)
sc.setLogLevel("WARN")
//0. Prepare the data to write
val dataRDD: RDD[(String, Int)] = sc.parallelize(List(("jack", 18), ("tom", 19), ("rose", 20)))
//1. Write the data to MySQL using the Spark API
//The goal is to write every record in the RDD to MySQL, which means iterating over the RDD. Should we use foreach or foreachPartition?
/*
rdd.foreach(s=>{
//open a connection
//save the record
//close the connection
})
rdd.foreachPartition(partition=>{
//open a connection
partition.foreach(s=>{
//save the record
})
//close the connection
})
*/
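//Answer: foreachPartition. It opens and closes one JDBC connection per partition,
//whereas foreach would open and close a connection for every single record, which is far more expensive.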
/*dataRDD.foreachPartition(p=>{
//0. Load the JDBC driver -- can be omitted, DriverManager loads it automatically
//1. Get a connection
val conn: Connection = DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata?characterEncoding=UTF-8","root","root")
//2. Prepare the SQL statement
val sql:String = "INSERT INTO `t_student` (`id`, `name`, `age`) VALUES (NULL, ?, ?);"
//3. Get the PreparedStatement
val ps: PreparedStatement = conn.prepareStatement(sql)
p.foreach(t=>{
val name: String = t._1
val age: Int = t._2
//4. Set the parameters
ps.setString(1,name)
ps.setInt(2,age)
//5. Execute the SQL
ps.executeUpdate()
})
//6. Release resources: close the statement first, then the connection
ps.close()
conn.close()
})*/
//dataRDD.foreachPartition(p=>data2MySQL(p))
//dataRDD.foreachPartition(data2MySQL(_))
//dataRDD.foreachPartition(data2MySQL _)
dataRDD.foreachPartition(data2MySQL)
//2. Read data from MySQL using the Spark API
/*
sc: SparkContext,
getConnection: () => Connection,
sql: String,
lowerBound: Long,//lower bound of the id range
upperBound: Long,//upper bound of the id range
numPartitions: Int,//number of partitions
mapRow: (ResultSet) => T = JdbcRDD.resultSetToObjectArray _) //function that maps each row of the ResultSet to a value
*/
val getConnection = ()=> DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata?characterEncoding=UTF-8","root","root")
val sql:String = "select id,name,age from t_student where id >= ? and id <=?"
val mapRow = (res:ResultSet)=>{
//Pull id, name and age out of the result set and return them as a tuple
val id: Int = res.getInt("id")
val name: String = res.getString("name")
val age: Int = res.getInt("age")
(id,name,age)
}
val studentRDD: JdbcRDD[(Int, String, Int)] = new JdbcRDD[(Int, String, Int)](
sc,
getConnection,
sql,
3,//lowerBound: smallest id to read
6,//upperBound: largest id to read
2,//numPartitions: how many partitions (queries) the id range is split into
mapRow
)
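//With lowerBound=3, upperBound=6 and numPartitions=2, JdbcRDD issues two queries,
//one for id in [3,4] and one for id in [5,6]; the SQL must contain exactly two ? placeholders for these bounds.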
studentRDD.collect().foreach(println)
//3. Stop the SparkContext to release resources
sc.stop()
}
def data2MySQL(p: Iterator[(String, Int)]): Unit = {
//0. Load the JDBC driver -- can be omitted, DriverManager loads it automatically
//1. Get a connection
val conn: Connection = DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata?characterEncoding=UTF-8","root","root")
//2. Prepare the SQL statement
val sql:String = "INSERT INTO `t_student` (`id`, `name`, `age`) VALUES (NULL, ?, ?);"
//3. Get the PreparedStatement
val ps: PreparedStatement = conn.prepareStatement(sql)
p.foreach(t=>{
val name: String = t._1
val age: Int = t._2
//4. Set the parameters
ps.setString(1,name)
ps.setInt(2,age)
//5. Execute the SQL
ps.executeUpdate()
})
//6. Release resources: close the statement first, then the connection
ps.close()
conn.close()
}
}
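- For comparison with the note above that SparkSQL simplifies JDBC: a minimal sketch of the SparkSQL equivalent (this assumes the spark-sql dependency and an existing SparkSession named spark; the names are illustrative and not part of the original example)
val props = new java.util.Properties()
props.setProperty("user", "root")
props.setProperty("password", "root")
val url = "jdbc:mysql://localhost:3306/bigdata?characterEncoding=UTF-8"
//Read the whole t_student table as a DataFrame -- no manual connection or statement handling needed
val studentDF = spark.read.jdbc(url, "t_student", props)
studentDF.show()
//Writing back is just as short, e.g. someDF.write.mode("append").jdbc(url, "t_student", props)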