package com.mobanker.mongo2hive.Mongo2Hive;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.hive.HiveContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.bson.Document;
import com.mongodb.spark.MongoSpark;
import java.io.File;
import java.util.ArrayList;
import java.util.List;

public class Mongo2Hive {
    public static void main(String[] args) {
        // Spark 2.x: build a Hive-enabled SparkSession and point the MongoDB connector at the source collection
        String warehouseLocation = new File("spark-warehouse").getAbsolutePath();
        SparkSession spark = SparkSession.builder()
                .master("local[2]")
                .appName("SparkReadMgToHive")
                .config("spark.sql.warehouse.dir", warehouseLocation)
                .config("spark.mongodb.input.uri", "mongodb://10.40.20.47:27017/test_db.test_table")
                .enableHiveSupport()
                .getOrCreate();
        JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

        // Spark 1.x equivalent, kept for reference:
        // SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("SparkReadMgToHive");
        // conf.set("spark.mongodb.input.uri", "mongodb://127.0.0.1:27017/test.mgtest");
        // conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
        // JavaSparkContext sc = new JavaSparkContext(conf);
        // sc.addJar("/Users/mac/zhangchun/jar/mongo-spark-connector_2.11-2.2.2.jar");
        // sc.addJar("/Users/mac/zhangchun/jar/mongo-java-driver-3.6.3.jar");
        // HiveContext sqlContext = new HiveContext(sc);
        // // create a DataFrame straight from MongoDB
        // Dataset df = MongoSpark.read(sqlContext).load().toDF();
        // df.select("id","name","name").show();
String querysql= "select id,name,location,sex,position from mgtohive_2 b";
String opType="P";
SQLUtils sqlUtils= newSQLUtils();
List column =sqlUtils.getColumns(querysql);//create rdd from mongo
JavaRDD rdd =MongoSpark.load(sc);//将Document转成Object
JavaRDD Ordd = rdd.map(new Function() {publicObject call(Document document){
List list= newArrayList();for (int i = 0; i < column.size(); i++) {
list.add(String.valueOf(document.get(column.get(i))));
}returnlist;//return list.toString().replace("[","").replace("]","");
}
});
        System.out.println(Ordd.first());

        // build the schema programmatically and convert the RDD of value lists into a DataFrame
        List<String> ls = new ArrayList<>();
        for (int i = 0; i < column.size(); i++) {
            ls.add(column.get(i));
        }
        String schemaString = ls.toString().replace("[", "").replace("]", "").replace(" ", "");
        System.out.println(schemaString);
        List<StructField> fields = new ArrayList<>();
        for (String fieldName : schemaString.split(",")) {
            StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
            fields.add(field);
        }
        StructType schema = DataTypes.createStructType(fields);
        JavaRDD<Row> rowRDD = Ordd.map((Function<Object, Row>) record -> {
            List<String> values = (List<String>) record;
            return RowFactory.create(values.toArray());
        });
        Dataset<Row> df = spark.createDataFrame(rowRDD, schema);

        // write the DataFrame into Hive: switch to the target database
        spark.sql("use datalake");
        // register the Mongo data as a temporary view
        // (createOrReplaceTempView is the non-deprecated Spark 2.x equivalent)
        df.registerTempTable("mgtable");
        if ("O".equals(opType.trim())) {
            System.out.println("Inserting data into an ordinary Hive table");
            Long t1 = System.currentTimeMillis();
            spark.sql("insert into mgtohive_2 " + querysql + " where b.id not in (select id from mgtohive_2)");
            System.out.println("insert into mgtohive_2 " + querysql);
            Long t2 = System.currentTimeMillis();
            System.out.println("Elapsed: " + (t2 - t1) / 60000 + " minutes");
        } else if ("P".equals(opType.trim())) {
            System.out.println("Inserting data into a dynamically partitioned Hive table");
            Long t3 = System.currentTimeMillis();
            // this setting is required for dynamic partition inserts, otherwise the statement fails
            spark.sql("set hive.exec.dynamic.partition.mode=nonstrict");
            // sex is the partition column, so it must be the last field in the select list;
            // the target table mg_hive_external must already exist (its DDL is sketched below)
            spark.sql("insert into mg_hive_external partition(sex) select id,name,location,position,sex from mgtable b where b.id not in (select id from mg_hive_external)");
            Long t4 = System.currentTimeMillis();
            System.out.println("Elapsed: " + (t4 - t3) / 60000 + " minutes");
        }
        spark.stop();
    }
}
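
The SQLUtils helper used above is part of the author's project and is not included in the listing. Below is a minimal sketch of what its getColumns method could look like, assuming the query always has the simple form "select col1,col2,... from table"; the class name and method signature are taken from the call site above, everything else is an assumption.

package com.mobanker.mongo2hive.Mongo2Hive;

import java.util.ArrayList;
import java.util.List;

// Hypothetical helper: extracts the column names from the select list of a simple query.
public class SQLUtils {
    public List<String> getColumns(String querysql) {
        // take the text between "select" and "from" and split it on commas
        String lower = querysql.toLowerCase();
        int start = lower.indexOf("select") + "select".length();
        int end = lower.indexOf("from");
        String selectList = querysql.substring(start, end).trim();
        List<String> columns = new ArrayList<>();
        for (String col : selectList.split(",")) {
            columns.add(col.trim());
        }
        return columns;
    }
}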
This article shows how to read data from MongoDB with Java and Spark and write it into Hive. A SparkSession is created with the MongoDB input URI configured, the collection is loaded as an RDD of Documents and converted into a DataFrame, and the data is then written into the target Hive table, inserting into either an ordinary table or a dynamically partitioned table depending on the operation type.
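
The dynamic partition branch assumes the table mg_hive_external already exists in the datalake database and is partitioned by sex. The fragment below, meant to run once against the same SparkSession before the insert, sketches the DDL it presupposes; the column types are assumed to be STRING because the DataFrame is built entirely with DataTypes.StringType, and the storage format and LOCATION path are placeholders, not taken from the article.

        // one-time setup (sketch): create the partitioned external target table
        spark.sql("use datalake");
        spark.sql(
            "create external table if not exists mg_hive_external ("
          + "  id string, name string, location string, position string"
          + ") partitioned by (sex string) "
          + "stored as parquet "
          + "location '/user/hive/warehouse/datalake.db/mg_hive_external'");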