Spark Streaming + Spark SQL for ETL

I spent quite a while getting this code right, so I'm keeping a record of it here (business-specific code has been removed):
Spark 2.4.0

Oracle-to-ODPS example:

import aliyun.spark.test.odps2oracle.util.MessageMapperUtilOracle;
import aliyun.spark.test.util.PropertiesUtil;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;

/**
 * Reads data from an Oracle database and writes it into an ODPS partitioned table.
 * args[0] run mode: local or cluster
 * args[1] Oracle source table name
 * args[2] ODPS result table name
 * args[3] numPartitions: number of partitions
 */
public class Oracle2ODPS {

   
    public static void main(String[] args) {

        System.out.println("args[]:" + Arrays.toString(args));
        SparkSession ss;
        if(args[0] != null && args[0].equals("local")){
            // Run locally.
            // spark.master must be set to local[N] to run directly; N is the degree of parallelism.
            ss = SparkSession.builder().appName("ORACLE_TO_ODPS").config("spark.master", "local[2]")
                    .config("spark.hadoop.odps.project.name", "xxxxx")
                    .config("spark.hadoop.odps.access.id", "xxxxx")
                    .config("spark.hadoop.odps.access.key", "xxxxx")
                    .config("spark.hadoop.odps.end.point", "xxxxx")
                    .config("spark.hadoop.tunnel.end.point","xxxxx")
                    .config("spark.sql.catalogImplementation", "odps")
                    .config("spark.testing.memory", 2147480000)
                    .config("odps.exec.dynamic.partition.mode","nonstrict")
                    .config("spark.hadoop.fs.oss.accessKeyId","")
                    .config("spark.hadoop.fs.oss.accessKeySecret","")
                    .config("spark.hadoop.fs.oss.endpoint","oss-cn-beijing.aliyuncs.com")
                    .getOrCreate();
        } else {
            // Run on the cluster.
            ss = SparkSession.builder().appName("ORACLE_TO_ODPS")
                    .config("odps.exec.dynamic.partition.mode", "nonstrict")
                    .getOrCreate();
        }
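
        // NOTE: the business logic that originally produced these values was removed by
        // the author; the assignments below are hypothetical placeholders (based on the
        // documented args) so the rest of the example reads as a whole.
        String oracleTable = args[1];   // Oracle source table name
        String ds = new SimpleDateFormat("yyyyMMdd").format(new Date());   // partition value
        String allColume = "COL1, COL2";   // hypothetical column list; originally returned by createOdpsTableAll(...)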
     

     
        // Read from Oracle via JDBC. A synthetic column RN (last ROWID character mapped to 0..19)
        // serves as the partition column, so the read is split into parallel tasks between
        // lowerBound and upperBound across numPartitions.
        Dataset<Row> jdbcJcDF = ss.read().format("jdbc")
                .option("driver", PropertiesUtil.getProperty("oracle.driver"))
                .option("url", PropertiesUtil.getProperty("oracle.url"))
                .option("user", PropertiesUtil.getProperty("oracle.user"))
                .option("password", PropertiesUtil.getProperty("oracle.password"))
                .option("dbtable", "(select MOD(ASCII(SUBSTR(ROWID, -1)), 20) as RN, q.* from " + oracleTable + " q)")
                .option("numPartitions", 4)
                .option("partitionColumn", "RN")
                .option("lowerBound", 0)
                .option("upperBound", 20)
                .option("fetchsize", "10000")
                .load();

        // jdbcJcDF.show();
        System.out.println("Start writing to ODPS...");

        // Write into the ODPS partitioned table through a temp view.
        jdbcJcDF.createOrReplaceTempView(oracleTable + "_TMP");
        ss.sql("insert overwrite table XXX_ALL partition (DS='" + ds + "') select " + allColume + " from " + oracleTable + "_TMP");

        ss.close();
        System.out.println("spark run over!");
    }




    public static String fieldConvertToOdpsField(String name, String dataType) {
        // body removed by the author (same logic as fieldConvertToOdpsField in SparkKafka2Odps below)
    }

    /**
     * Creates the partitioned table on ODPS.
     * SparkSession ss
     * String database   target database
     * String mapperName table corresponding to the mapper
     * Boolean flag      whether to drop and recreate the table
     * @author Administrator
     */
    public static String createOdpsTableAll(SparkSession ss, String database, String mapperName, Boolean flag){
        // body removed by the author
    }


    public static String createOdpsETLTableAll(SparkSession ss, String database, String mapperName, Boolean flag){
        // body removed by the author
    }


    private static String calculateDateString(String dateString , int dayNum , String dateFormat){
        Calendar calendar = Calendar.getInstance();
        SimpleDateFormat formatter = new SimpleDateFormat(dateFormat);
        Date date = null ;
        try {
            date = formatter.parse(dateString);
        } catch (ParseException e) {
            e.printStackTrace();
        }
        }
        calendar.setTime(date);
        calendar.add(Calendar.DAY_OF_MONTH, dayNum);
        return formatter.format(calendar.getTime());
    }

}


Kafka-to-ODPS example:


import aliyun.spark.test.odps2oracle.util.MessageMapperUtilOracle;
import aliyun.spark.test.util.SparkSessionSingleton;
import com.alibaba.fastjson.JSONObject;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;

/**
 *   This one is a bit more complete; some business-specific code has been removed.
 */
public class SparkKafka2Odps {


    public static void main(String[] args) throws InterruptedException {

  

        Date nowDate = new Date();
        SimpleDateFormat nowFormat = new SimpleDateFormat("yyyyMMdd");
        String sysDate = nowFormat.format(nowDate);
        String ds = calculateDateString(sysDate, 0, "yyyyMMdd");
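
        // NOTE: mapperName and odpsTableName were set by business code the author removed;
        // the placeholders below are hypothetical and only keep the example self-contained.
        String mapperName = "XXX_MAPPER";          // name of the mapper describing the source table
        String odpsTableName = "XXX_ODPS_TABLE";   // target ODPS partitioned table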

        SparkConf conf = new SparkConf().setAppName("sparkStreaming-kafka-odps")
                .set("spark.driver.allowMultipleContexts","true")
                .set("spark.task.maxFailures","10")
                .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
                .set("spark.hadoop.odps.project.name", "xxx\")
                .set("spark.hadoop.odps.access.id", "xxx")
                .set("spark.hadoop.odps.access.key", "xxx")
                .set("spark.hadoop.odps.end.point", "xxx")
                .set("spark.hadoop.tunnel.end.point","xxx")
                .set("spark.sql.catalogImplementation", "odps")
                .set("odps.exec.dynamic.partition.mode","nonstrict")
                .set("spark.hadoop.fs.oss.accessKeyId","")
                .set("spark.hadoop.fs.oss.accessKeySecret","")
                .set("spark.hadoop.fs.oss.endpoint","oss-cn-beijing.aliyuncs.com")
                .setMaster("local[1]");

        JavaStreamingContext streamingContext = new JavaStreamingContext(conf, Durations.seconds(1));

        streamingContext.sparkContext().setLogLevel("WARN");

        // Make sure the ODPS partitioned table exists (the returned column list is not used here).
        String allColume = createOdpsTable(SparkSessionSingleton.getInstance("local"), "", mapperName, odpsTableName, true);

        // System.out.println(allColume);

        Map<String, Object> kafkaParams = new HashMap<>();
        kafkaParams.put("bootstrap.servers", "xxx:9092,xxx:9092,xxx:9092");
        kafkaParams.put("key.deserializer", StringDeserializer.class);
        kafkaParams.put("value.deserializer", StringDeserializer.class);
        kafkaParams.put("group.id", "xxxxxx");
        kafkaParams.put("auto.offset.reset", "earliest");
        //  kafkaParams.put("auto.offset.reset", "latest");
        kafkaParams.put("enable.auto.commit", false);

        Set<String> topicsSet = new HashSet<>(Arrays.asList("xxxxxx"));


        final JavaInputDStream<ConsumerRecord<String, String>> stream =
                KafkaUtils.createDirectStream(
                        streamingContext,
                        LocationStrategies.PreferConsistent(),
                        ConsumerStrategies.<String, String>Subscribe(topicsSet, kafkaParams)
                );

        // Repartition to 6 to increase downstream parallelism (this introduces a shuffle).
        JavaDStream<ConsumerRecord<String, String>> repartition = stream.repartition(6);

        JavaDStream<String> lines = repartition.map(ConsumerRecord::value);

        // Build the Spark schema from the Oracle column mapper.
        OracleTableMapper tableMapperByOracle = MessageMapperUtilOracle.getTableMapper("", mapperName);
        List<OracleColumnMapper> columns = tableMapperByOracle.getColumns();
        List<StructField> fields = new ArrayList<>();
        for (OracleColumnMapper c : columns){
            StructField field = DataTypes.createStructField(c.getName(), fieldConvertToSparkField(c.getDataType()), true);
            fields.add(field);
        }

        StructType schema = DataTypes.createStructType(fields);

        StructField[] fieldArray = schema.fields();

        // Field names in schema order, used later to pull values out of the JSON messages.
        String[] split = new String[fieldArray.length];
        for (int i = 0; i < fieldArray.length; i++) {
            split[i] = fieldArray[i].name();
        }


        // Convert RDDs of the words DStream to DataFrame and run SQL query
        /*************START****************/
        lines.foreachRDD((rdd,time)->{
            // Get the singleton instance of SparkSession
            SparkSession spark = SparkSessionSingleton.getInstance("local");

            // Drop empty messages, then turn each JSON record into a Row matching the schema.
            JavaRDD<Row> rowRDD = rdd.filter(str -> str != null && !str.isEmpty())
                    .map((Function<String, Row>) record -> {
                        // System.out.println(record);
                        JSONObject json = JSONObject.parseObject(record);

                        Object[] objects = new Object[fieldArray.length];
                        for (int i = 0; i < objects.length; i++) {
                            // Schema field names are looked up in lowercase in the incoming JSON.
                            objects[i] = String.valueOf(json.get(split[i].toLowerCase()));
                        }
                        return RowFactory.create(objects);
                    });


            Dataset<Row> dataFrame = spark.createDataFrame(rowRDD, schema);

           // dataFrame.printSchema();



            dataFrame.createOrReplaceTempView(mapperName+"_TMP");

            //Dataset<Row>   wordCountsDataFrame =  spark.sql("select * from " + mapperName+"_TMP limit 10 ");
            spark.sql("insert into table  " + odpsTableName + "  partition (DS='" + ds + "') select * from "+mapperName+"_TMP");

            //wordCountsDataFrame.show();
            System.out.println("========= " + time + "=========");
        });

        /*************END****************/
        streamingContext.start();
        streamingContext.awaitTermination();
    }




    /**
     * Creates the partitioned table on ODPS.
     * SparkSession ss
     * String database      target database
     * String mapperName    table corresponding to the mapper
     * String odpsTableName name of the ODPS table to create
     * Boolean flag         whether to drop and recreate the table
     * @author Administrator
     */
    public static String createOdpsTable(SparkSession ss, String database, String mapperName,String odpsTableName, Boolean flag){

        String allColume = "";

        OracleTableMapper tableMapperByOracle = MessageMapperUtilOracle.getTableMapper(database,mapperName);
        if(tableMapperByOracle!=null){
            List<OracleColumnMapper> columns = tableMapperByOracle.getColumns();
            String str1 = "CREATE TABLE  IF NOT EXISTS " + odpsTableName + " (";
            String str2 = "";
            for (OracleColumnMapper c : columns){
                str2 += c.getName() + " " +  fieldConvertToOdpsField(c.getName(),c.getDataType()) + ", ";
                allColume +=  c.getName() + ", ";
            }
            String str3 = str2.substring(0, str2.length() -2);
            String str4 = ") PARTITIONED BY (DS STRING)";
            String str5 = str1 + str3 + str4;

            System.out.println(mapperName + " CREATE TABLE statement: " + str5);

            allColume =  allColume.substring(0, allColume.length() -2);

            // Drop the table first if a full rebuild was requested.
            if (flag) {
                ss.sql("drop table if exists " + odpsTableName);
            }
            ss.sql(str5);

        }
        return allColume;
    }
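
    // For reference: with a hypothetical mapper describing two Oracle columns
    // (ID NUMBER, NAME VARCHAR2), the generated statement would look roughly like
    //   CREATE TABLE IF NOT EXISTS <odpsTableName> (ID BIGINT, NAME STRING) PARTITIONED BY (DS STRING)
    // and the returned column list (allColume) would be "ID, NAME".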


    public static String fieldConvertToOdpsField(String name, String dataType) {
        String str = null;
        if("NUMBER".equals(dataType)) {
            str = "BIGINT";
        }else if("VARCHAR2".equals(dataType)) {
            str = "STRING";
        }else if("DATE".equals(dataType)) {
            str = "STRING";
        }else{
            str = "DOUBLE";
        }
        return str;
    }

    public static DataType fieldConvertToSparkField(String dataType) {
        // Every incoming value is treated as a string in the Spark schema; the typed
        // mapping only matters on the ODPS side (see fieldConvertToOdpsField).
        if ("NUMBER".equals(dataType)) {
            return DataTypes.StringType;
        } else if ("VARCHAR2".equals(dataType)) {
            return DataTypes.StringType;
        } else if ("DATE".equals(dataType)) {
            // return DataTypes.DateType;
            return DataTypes.StringType;
        } else if ("DECIMAL".equals(dataType)) {
            return DataTypes.StringType;
        } else {
            return DataTypes.StringType;
        }
    }


    private static String calculateDateString(String dateString , int dayNum , String dateFormat){
        Calendar calendar = Calendar.getInstance();
        SimpleDateFormat formatter = new SimpleDateFormat(dateFormat);
        Date date = null ;
        try {
            date = formatter.parse(dateString);
        } catch (ParseException e) {
            e.printStackTrace();
        }
        calendar.setTime(date);
        calendar.add(Calendar.DAY_OF_MONTH, dayNum);
        return formatter.format(calendar.getTime());
    }


}

// Singleton SparkSession
public class SparkSessionSingleton {

    private static SparkSession ss;

    public static SparkSession getInstance(String flag){

        if  (ss == null){

            if("local".equals(flag)){
                ss = SparkSession.builder()
                        .config("spark.master", "local[2]")
                        .config("spark.hadoop.odps.project.name", "xxx")
                        .config("spark.hadoop.odps.access.id", "xxx")
                        .config("spark.hadoop.odps.access.key", "xxx")
                        .config("spark.hadoop.odps.end.point", "xxx")
                        .config("spark.hadoop.tunnel.end.point","xxx")
                        .config("spark.sql.catalogImplementation", "odps")
                        .config("spark.testing.memory", 2147480000)
                        .config("odps.exec.dynamic.partition.mode","nonstrict")
                        .config("spark.hadoop.fs.oss.accessKeyId","")
                        .config("spark.hadoop.fs.oss.accessKeySecret","")
                        .config("spark.hadoop.fs.oss.endpoint","xxx")
                        .getOrCreate();
            }else {
                // Run on the cluster.
                ss = SparkSession.builder().appName("ORACLE_TO_ODPS")
                        .config("odps.exec.dynamic.partition.mode", "nonstrict")
                        .getOrCreate();
            }

        }

        return ss;

    }
}




Note: Spark Streaming + Spark SQL do work together; just make sure there is only ever one SparkSession.
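
As an alternative to the hand-rolled singleton above, a minimal sketch (following the pattern in the official Spark Streaming + SQL examples) is to rebuild the session from the RDD's SparkConf inside foreachRDD; getOrCreate() then always hands back the one existing session:

// Sketch only: assumes a JavaDStream<String> named lines, as in the example above.
lines.foreachRDD((rdd, time) -> {
    SparkSession spark = SparkSession.builder()
            .config(rdd.context().getConf())   // reuse the streaming context's configuration
            .getOrCreate();                    // returns the existing SparkSession if there is one
    // ... build the DataFrame and run SQL as shown above ...
});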
