SparkSQL Study Notes ---- Working with the JDBC Data Source

1. Using the JDBC data source

Spark SQL supports reading data from relational databases over JDBC. The data that is read is represented as a DataFrame, which can then be processed conveniently with the various operators provided by Spark Core.

  • Java version
Map<String,String> options = new HashMap<String,String>();
options.put("url","jdbc:mysql://spark1:3306/testdb");
options.put("dbtable","students");
Dataset<Row> jdbcDF = sqlContext.read().format("jdbc").options(options).load();
  • Scala version
val jdbcDF = sqlContext.read.format("jdbc").options(
Map("url" -> "jdbc:mysql://spark1:3306/testdb",
"dbtable" -> "students")).load()
2. Practice: query students with scores above 80
package pz.spark.study.sql;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class JDBCDataSource {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("JDBCDataSource").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);
        //Load the two MySQL tables as DataFrames
        Map<String, String> options = new HashMap<String, String>();
        options.put("url", "jdbc:mysql://spark1:3306/testdb");
        options.put("dbtable", "student_infos");
        Dataset<Row> studentInfosDF = sqlContext.read().format("jdbc").options(options).load();

        options.clear();
        options.put("url", "jdbc:mysql://spark1:3306/testdb");
        options.put("dbtable", "student_scores");
        Dataset<Row> studentScoresDF = sqlContext.read().format("jdbc").options(options).load();

        //Convert both DataFrames to JavaPairRDDs and join them
        JavaPairRDD<String, Tuple2<Integer, Integer>> studentsRDD = studentInfosDF.javaRDD().mapToPair(new PairFunction<Row, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(Row row) throws Exception {
                return new Tuple2<String, Integer>(row.getString(0), Integer.valueOf(String.valueOf(row.get(1))));
            }
        }).join(studentScoresDF.javaRDD().mapToPair(new PairFunction<Row, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(Row row) throws Exception {
                return new Tuple2<String, Integer>(String.valueOf(row.get(0)), Integer.valueOf(String.valueOf(row.get(1))));
            }
        }));

        //Convert the JavaPairRDD to a JavaRDD<Row>
        JavaRDD<Row> studentRowsRDD = studentsRDD.map(new Function<Tuple2<String, Tuple2<Integer, Integer>>, Row>() {
            @Override
            public Row call(Tuple2<String, Tuple2<Integer, Integer>> stringTuple2Tuple2) throws Exception {
                return RowFactory.create(stringTuple2Tuple2._1, stringTuple2Tuple2._2._1, stringTuple2Tuple2._2._2);
            }
        });
        //Keep only the rows with score > 80
        JavaRDD<Row> filteredStudentRowsRDD = studentRowsRDD.filter(new Function<Row, Boolean>() {
            @Override
            public Boolean call(Row row) throws Exception {
                return row.getInt(2) > 80;
            }
        });

        //Convert back to a DataFrame with an explicit schema
        List<StructField> structFields = new ArrayList<StructField>();

        structFields.add(DataTypes.createStructField("name",DataTypes.StringType,true));
        structFields.add(DataTypes.createStructField("age",DataTypes.IntegerType,true));
        structFields.add(DataTypes.createStructField("score",DataTypes.IntegerType,true));

        StructType structType = DataTypes.createStructType(structFields);
        Dataset<Row> studentsDF = sqlContext.createDataFrame(filteredStudentRowsRDD, structType);

        List<Row> rows = studentsDF.collectAsList();
        for (Row row : rows) {
            System.out.println(row);
        }
        //Save the DataFrame to a MySQL table via the DataFrameWriter
        /*options.put("dbtable", "good_student_infos");

        studentsDF.write().format("jdbc").options(options).save();*/
        //This approach is more common in production: write row by row, possibly into MySQL, HBase, or a Redis cache
        studentsDF.javaRDD().foreach(new VoidFunction<Row>() {
            @Override
            public void call(Row row) throws Exception {
                String sql = "insert into good_student_infos values("
                        +"'" + String.valueOf(row.getString(0)) + "',"
                        +Integer.valueOf(String.valueOf(row.get(1)))+ ","
                        +Integer.valueOf(String.valueOf(row.get(2))) + ")";
                Class.forName("com.mysql.jdbc.Driver");
                Connection conn = null;
                Statement stmt = null;
                try{
                    conn = DriverManager.getConnection("jdbc:mysql://spark1:3306/testdb","","");
                    stmt = conn.createStatement();
                    stmt.executeUpdate(sql);
                }catch(Exception e){
                    e.printStackTrace();
                }finally{
                    if(stmt != null){
                        stmt.close();
                    }
                    if(conn != null){
                        conn.close();
                    }
                }
            }
        });
        sc.close();
    }
}
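
Opening a connection and a statement for every row is expensive. A common refinement, shown as a sketch below, is foreachPartition(), so each partition shares one connection and a PreparedStatement (which also avoids string-concatenated SQL). It additionally requires importing java.sql.PreparedStatement and java.util.Iterator, and the credentials here are placeholders:

studentsDF.javaRDD().foreachPartition(new VoidFunction<Iterator<Row>>() {
    @Override
    public void call(Iterator<Row> rows) throws Exception {
        Class.forName("com.mysql.jdbc.Driver");
        // One connection and one prepared statement per partition, not per row
        Connection conn = DriverManager.getConnection(
                "jdbc:mysql://spark1:3306/testdb", "root", "123456");   // placeholder credentials
        PreparedStatement pstmt = conn.prepareStatement(
                "insert into good_student_infos values(?, ?, ?)");
        try {
            while (rows.hasNext()) {
                Row row = rows.next();
                pstmt.setString(1, row.getString(0));
                pstmt.setInt(2, row.getInt(1));
                pstmt.setInt(3, row.getInt(2));
                pstmt.executeUpdate();
            }
        } finally {
            pstmt.close();
            conn.close();
        }
    }
});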
3. Summary: using the JDBC data source
  • First, use the read-family methods of SQLContext to load the MySQL data as DataFrames;
  • Then convert the DataFrames to RDDs and process them with the various operators provided by Spark Core;
  • Finally, write the resulting data back to MySQL, HBase, etc. with the foreach() operator (an alternative using the DataFrameWriter is sketched right after this list).
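
For the write-back step, Spark also offers DataFrameWriter.jdbc(), which saves a whole DataFrame to a table in one call. A minimal sketch (the credentials are placeholders; it requires importing java.util.Properties and org.apache.spark.sql.SaveMode):

Properties props = new Properties();
props.put("user", "root");          // placeholder credentials
props.put("password", "123456");    // placeholder credentials
// SaveMode.Append adds rows to the existing table instead of overwriting it
studentsDF.write().mode(SaveMode.Append)
        .jdbc("jdbc:mysql://spark1:3306/testdb", "good_student_infos", props);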

These are my study notes for the Beifeng (北风网) Spark 2.0 training videos.
Video link:
https://www.bilibili.com/video/av19995678/?p=113
