Spent quite a while grinding on this code, so here's a record of it (business-specific code removed):
spark-2.4.0
Oracle2ODPS example:
import aliyun.spark.test.odps2oracle.util.MessageMapperUtilOracle;
import aliyun.spark.test.util.PropertiesUtil;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
/**
* Reads data from Oracle and writes it into an ODPS partitioned table
* args[0] run mode, local or cluster
* args[1] Oracle source table name
* args[2] ODPS result table name
* args[3] numPartitions, number of partitions
*/
public class Oracle2ODPS {
public static void main(String[] args) {
System.out.println("args[]:" + Arrays.toString(args));
SparkSession ss;
if(args[0] != null && args[0].equals("local")){
// Local execution
// spark.master must be set to local[N] to run directly; N is the number of concurrent threads
ss = SparkSession.builder().appName("ORACLE_TO_ODPS").config("spark.master", "local[2]")
.config("spark.hadoop.odps.project.name", "xxxxx")
.config("spark.hadoop.odps.access.id", "xxxxx")
.config("spark.hadoop.odps.access.key", "xxxxx")
.config("spark.hadoop.odps.end.point", "xxxxx")
.config("spark.hadoop.tunnel.end.point","xxxxx")
.config("spark.sql.catalogImplementation", "odps")
.config("spark.testing.memory", 2147480000)
.config("odps.exec.dynamic.partition.mode","nonstrict")
.config("spark.hadoop.fs.oss.accessKeyId","")
.config("spark.hadoop.fs.oss.accessKeySecret","")
.config("spark.hadoop.fs.oss.endpoint","oss-cn-beijing.aliyuncs.com")
.getOrCreate();
}else {
// Cluster execution
ss = SparkSession.builder().appName("ORACLE_TO_ODPS").config("odps.exec.dynamic.partition.mode","nonstrict").getOrCreate();
}
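// NOTE: oracleTable, ds and allColume are set by the business code that was removed;
// a minimal sketch of what they might look like, assuming args[] as documented above:
String oracleTable = args[1];   // Oracle source table
String ds = new SimpleDateFormat("yyyyMMdd").format(new Date());   // partition value, e.g. today's date
String allColume = "*";         // originally the explicit column list built from the column mapper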
// Read data from Oracle over JDBC
Dataset<Row> jdbcJcDF = ss.read().format("jdbc")
.option("driver", PropertiesUtil.getProperty("oracle.driver"))
.option("url", PropertiesUtil.getProperty("oracle.url"))
.option("user", PropertiesUtil.getProperty("oracle.user"))
.option("password", PropertiesUtil.getProperty("oracle.password"))
.option("dbtable", "(select MOD(ASCII(SUBSTR(ROWID, -1)), 20) as RN ,q.* from " + oracleTable +" q)")
.option("numPartitions",4)
.option("partitionColumn", "RN")
.option("lowerBound", 0)
.option("upperBound", 20)
.option("fetchsize", "10000").load();
// jdbcJcDF.show();
System.out.println("Starting write...");
// Write into the ODPS partitioned table
jdbcJcDF.createOrReplaceTempView(oracleTable + "_TMP");
ss.sql("insert overwrite table XXX_ALL partition (DS='" + ds + "') select " + allColume + " from " + oracleTable + "_TMP");
ss.close();
System.out.println("spark run over!");
}
public static String fieldConvertToOdpsField(String name, String dataType) {
// body removed (business-specific)
return null;
}
/**
* Creates a partitioned table on ODPS
* SparkSession ss
* String database   target database
* String mapperName table that the mapper corresponds to
* Boolean flag      whether to drop and recreate the table
* @author Administrator
*/
public static String createOdpsTableAll(SparkSession ss, String database, String mapperName, Boolean flag){
// body removed (business-specific)
return null;
}
public static String createOdpsETLTableAll(SparkSession ss, String database, String mapperName, Boolean flag){
// body removed (business-specific)
return null;
}
private static String calculateDateString(String dateString , int dayNum , String dateFormat){
Calendar calendar = Calendar.getInstance();
SimpleDateFormat formatter = new SimpleDateFormat(dateFormat);
Date date = null;
try {
date = formatter.parse(dateString);
} catch (ParseException e) {
e.printStackTrace();
}
calendar.setTime(date);
calendar.add(Calendar.DAY_OF_MONTH, dayNum);
return formatter.format(calendar.getTime());
}
}
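The PropertiesUtil.getProperty(...) calls above assume an Oracle connection config on the classpath; a minimal sketch of what it might contain (keys taken from the code, values are placeholders):
oracle.driver=oracle.jdbc.OracleDriver
oracle.url=jdbc:oracle:thin:@//xxx:1521/xxx
oracle.user=xxx
oracle.password=xxx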
Kafka2ODPS example:
import aliyun.spark.test.odps2oracle.util.MessageMapperUtilOracle;
import aliyun.spark.test.util.SparkSessionSingleton;
import com.alibaba.fastjson.JSONObject;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
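// OracleTableMapper / OracleColumnMapper are business mapper classes whose definitions were removed;
// all this example needs from them is getColumns(), getName() and getDataType().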
/**
* This example is more complete; some business-specific parts were removed.
*/
public class SparkKafka2Odps {
public static void main(String[] args) throws InterruptedException {
Date nowDate = new Date();
SimpleDateFormat nowFormat = new SimpleDateFormat("yyyyMMdd");
String sysDate = nowFormat.format(nowDate);
String ds = calculateDateString(sysDate, 0, "yyyyMMdd");
SparkConf conf = new SparkConf().setAppName("sparkStreaming-kafka-odps")
.set("spark.driver.allowMultipleContexts","true")
.set("spark.task.maxFailures","10")
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.set("spark.hadoop.odps.project.name", "xxx\")
.set("spark.hadoop.odps.access.id", "xxx")
.set("spark.hadoop.odps.access.key", "xxx")
.set("spark.hadoop.odps.end.point", "xxx")
.set("spark.hadoop.tunnel.end.point","xxx")
.set("spark.sql.catalogImplementation", "odps")
.set("odps.exec.dynamic.partition.mode","nonstrict")
.set("spark.hadoop.fs.oss.accessKeyId","")
.set("spark.hadoop.fs.oss.accessKeySecret","")
.set("spark.hadoop.fs.oss.endpoint","oss-cn-beijing.aliyuncs.com")
.setMaster("local[1]");
JavaStreamingContext streamingContext = new JavaStreamingContext(conf, Durations.seconds(1));
streamingContext.sparkContext().setLogLevel("WARN");
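// NOTE: mapperName and odpsTableName are defined in the business code that was removed;
// placeholder values here so the example reads as a whole:
String mapperName = "xxx";        // mapper name / Oracle-side table the messages map to
String odpsTableName = "xxx";     // target ODPS table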
String schemaString = createOdpsTable(SparkSessionSingleton.getInstance("local"), "", mapperName,odpsTableName, true);
// System.out.println(schemaString);
Map<String, Object> kafkaParams = new HashMap<>();
kafkaParams.put("bootstrap.servers", "xxx:9092,xxx:9092,xxx:9092");
kafkaParams.put("key.deserializer", StringDeserializer.class);
kafkaParams.put("value.deserializer", StringDeserializer.class);
kafkaParams.put("group.id", "xxxxxx");
kafkaParams.put("auto.offset.reset", "earliest");
// kafkaParams.put("auto.offset.reset", "latest");
kafkaParams.put("enable.auto.commit", false);
Set<String> topicsSet = new HashSet<>(Arrays.asList("xxxxxx"));
final JavaInputDStream<ConsumerRecord<String, String>> stream =
KafkaUtils.createDirectStream(
streamingContext,
LocationStrategies.PreferConsistent(),
ConsumerStrategies.<String, String>Subscribe(topicsSet, kafkaParams)
);
JavaDStream<ConsumerRecord<String, String>> repartition = stream.repartition(6);
JavaDStream<String> lines = repartition.map(ConsumerRecord::value);
OracleTableMapper tableMapperByOracle = MessageMapperUtilOracle.getTableMapper("", mapperName);
List<OracleColumnMapper> columns = tableMapperByOracle.getColumns();
List<StructField> fields = new ArrayList<>();
for (OracleColumnMapper c : columns){
StructField field = DataTypes.createStructField(c.getName(), fieldConvertToSparkField(c.getDataType()), true);
fields.add(field);
}
StructType schema = DataTypes.createStructType(fields);
StructField[] fieldArray = schema.fields();
String[] split = new String[fieldArray.length];
for (int i = 0; i < fieldArray.length; i++) {
split[i] = fieldArray[i].name();
}
// Convert each RDD of the DStream to a DataFrame and run a SQL query
/*************START****************/
lines.foreachRDD((rdd,time)->{
// Get the singleton instance of SparkSession
SparkSession spark = SparkSessionSingleton.getInstance("local");
// Drop null/empty messages, then map each JSON record onto the schema's columns
JavaRDD<Row> rowRDD = rdd.filter(str -> str != null && !str.isEmpty())
.map((Function<String, Row>) record -> {
// System.out.println(record);
JSONObject json = JSONObject.parseObject(record);
Object[] objects = new Object[fieldArray.length];
for (int i = 0; i < objects.length; i++) {
// JSON keys are expected in lower case; every value is written as a string
// (note: a missing key becomes the literal string "null" via String.valueOf)
objects[i] = String.valueOf(json.get(split[i].toLowerCase()));
}
return RowFactory.create(objects);
});
Dataset<Row> dataFrame = spark.createDataFrame(rowRDD, schema);
// dataFrame.printSchema();
dataFrame.createOrReplaceTempView(mapperName+"_TMP");
//Dataset<Row> wordCountsDataFrame = spark.sql("select * from " + mapperName+"_TMP limit 10 ");
spark.sql("insert into table " + odpsTableName + " partition (DS='" + ds + "') select * from "+mapperName+"_TMP");
//wordCountsDataFrame.show();
System.out.println("========= " + time + "=========");
});
/*************END****************/
streamingContext.start();
streamingContext.awaitTermination();
}
/**
* Creates a partitioned table on ODPS
* SparkSession ss
* String database      target database
* String mapperName    table that the mapper corresponds to
* String odpsTableName target ODPS table name
* Boolean flag         whether to drop and recreate the table
* @author Administrator
*/
public static String createOdpsTable(SparkSession ss, String database, String mapperName,String odpsTableName, Boolean flag){
String allColume = "";
OracleTableMapper tableMapperByOracle = MessageMapperUtilOracle.getTableMapper(database,mapperName);
if(tableMapperByOracle!=null){
List<OracleColumnMapper> columns = tableMapperByOracle.getColumns();
String str1 = "CREATE TABLE IF NOT EXISTS " + odpsTableName + " (";
String str2 = "";
for (OracleColumnMapper c : columns){
str2 += c.getName() + " " + fieldConvertToOdpsField(c.getName(),c.getDataType()) + ", ";
allColume += c.getName() + ", ";
}
String str3 = str2.substring(0, str2.length() -2);
String str4 = ") PARTITIONED BY (DS STRING)";
String str5 = str1 + str3 + str4;
System.out.println(mapperName + " CREATE TABLE statement: " + str5);
allColume = allColume.substring(0, allColume.length() -2);
// Drop and recreate the table if requested
if (flag) {
ss.sql("drop table if exists " + odpsTableName);
}
ss.sql(str5);
}
return allColume;
}
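// For illustration: with a hypothetical mapper containing ID (NUMBER) and NAME (VARCHAR2),
// str5 would be "CREATE TABLE IF NOT EXISTS <odpsTableName> (ID BIGINT, NAME STRING) PARTITIONED BY (DS STRING)"
// and the returned allColume would be "ID, NAME".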
public static String fieldConvertToOdpsField(String name, String dataType) {
String str = null;
if("NUMBER".equals(dataType)) {
str = "BIGINT";
}else if("VARCHAR2".equals(dataType)) {
str = "STRING";
}else if("DATE".equals(dataType)) {
str = "STRING";
}else{
str = "DOUBLE";
}
return str;
}
public static DataType fieldConvertToSparkField(String dataType) {
// Every Oracle type is mapped to StringType on the Spark side; the DateType alternative is left commented out
if("NUMBER".equals(dataType)) {
return DataTypes.StringType;
}else if("VARCHAR2".equals(dataType)) {
return DataTypes.StringType;
}else if("DATE".equals(dataType)) {
// return DataTypes.DateType;
return DataTypes.StringType;
}else if("DECIMAL".equals(dataType)) {
return DataTypes.StringType;
}else{
return DataTypes.StringType;
}
}
private static String calculateDateString(String dateString , int dayNum , String dateFormat){
Calendar calendar = Calendar.getInstance();
SimpleDateFormat formatter = new SimpleDateFormat(dateFormat);
Date date = null;
try {
date = formatter.parse(dateString);
} catch (ParseException e) {
e.printStackTrace();
}
calendar.setTime(date);
calendar.add(Calendar.DAY_OF_MONTH, dayNum);
return formatter.format(calendar.getTime());
}
}
// Singleton SparkSession
public class SparkSessionSingleton {
private static SparkSession ss;
public static SparkSession getInstance(String flag){
if (ss == null){
if("local".equals(flag)){
ss = SparkSession.builder()
.config("spark.master", "local[2]")
.config("spark.hadoop.odps.project.name", "xxx")
.config("spark.hadoop.odps.access.id", "xxx")
.config("spark.hadoop.odps.access.key", "xxx")
.config("spark.hadoop.odps.end.point", "xxx")
.config("spark.hadoop.tunnel.end.point","xxx")
.config("spark.sql.catalogImplementation", "odps")
.config("spark.testing.memory", 2147480000)
.config("odps.exec.dynamic.partition.mode","nonstrict")
.config("spark.hadoop.fs.oss.accessKeyId","")
.config("spark.hadoop.fs.oss.accessKeySecret","")
.config("spark.hadoop.fs.oss.endpoint","xxx")
.getOrCreate();
}else {
// Cluster execution
ss = SparkSession.builder().appName("ORACLE_TO_ODPS").config("odps.exec.dynamic.partition.mode","nonstrict").getOrCreate();
}
}
return ss;
}
}
Note: Spark Streaming + Spark SQL works fine together; just make sure there is only one SparkSession.
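A minimal sketch of that point (assuming a JavaDStream<String> named lines, as in the example above): inside foreachRDD, obtain the session with getOrCreate() or the singleton instead of building a new one per batch:
lines.foreachRDD((rdd, time) -> {
    // getOrCreate() returns the already-running SparkSession instead of creating a second one
    SparkSession spark = SparkSession.builder().config(rdd.context().getConf()).getOrCreate();
    // ... createDataFrame / createOrReplaceTempView / spark.sql, as in SparkKafka2Odps above
});
This is exactly what SparkSessionSingleton relies on: getOrCreate() reuses the existing session, so Spark Streaming and Spark SQL end up sharing one SparkSession.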