Spark + MongoDB: Reading MongoDB Data with Java Spark and Writing It into a Hive Table

This post shows how to use Java and Spark to read data from MongoDB and write it into a Hive table. First a SparkSession is created with the MongoDB input URI configured; the data is then loaded as an RDD of Documents and converted to a DataFrame. After processing, the rows are written into the target Hive table, with an operation-type flag choosing between an ordinary table and a dynamically partitioned table for the insert.

package com.mobanker.mongo2hive.Mongo2Hive;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.bson.Document;
import com.mongodb.spark.MongoSpark;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

public class Mongo2Hive {

    public static void main(String[] args) {

        // Spark 2.x: create a Hive-enabled SparkSession.
        String warehouseLocation = new File("spark-warehouse").getAbsolutePath();
        SparkSession spark = SparkSession.builder()
                .master("local[2]")
                .appName("SparkReadMgToHive")
                .config("spark.sql.warehouse.dir", warehouseLocation)
                .config("spark.mongodb.input.uri", "mongodb://10.40.20.47:27017/test_db.test_table")
                .enableHiveSupport()
                .getOrCreate();
        JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

        // Spark 1.x equivalent, kept for reference:
        // SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("SparkReadMgToHive");
        // conf.set("spark.mongodb.input.uri", "mongodb://127.0.0.1:27017/test.mgtest");
        // conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
        // JavaSparkContext sc = new JavaSparkContext(conf);
        // sc.addJar("/Users/mac/zhangchun/jar/mongo-spark-connector_2.11-2.2.2.jar");
        // sc.addJar("/Users/mac/zhangchun/jar/mongo-java-driver-3.6.3.jar");
        // HiveContext sqlContext = new HiveContext(sc);
        // // create df from mongo
        // Dataset df = MongoSpark.read(sqlContext).load().toDF();
        // df.select("id","name","name").show();

        // Selects from the temp view "mgtable" registered below.
        String querysql = "select id,name,location,sex,position from mgtable b";
        String opType = "P"; // "O" = ordinary table, "P" = dynamic partition table

        SQLUtils sqlUtils = new SQLUtils(); // helper not shown in the post; a sketch follows the listing
        List<String> column = sqlUtils.getColumns(querysql);

        // Create an RDD from MongoDB.
        JavaRDD<Document> rdd = MongoSpark.load(sc);

        // Convert each Document to a list of column values (as strings).
        JavaRDD<Object> Ordd = rdd.map(new Function<Document, Object>() {
            public Object call(Document document) {
                List<String> list = new ArrayList<>();
                for (int i = 0; i < column.size(); i++) {
                    list.add(String.valueOf(document.get(column.get(i))));
                }
                return list;
            }
        });

        System.out.println(Ordd.first());

        // Build the schema programmatically so the RDD can be converted to a DataFrame.
        List<String> ls = new ArrayList<>();
        for (int i = 0; i < column.size(); i++) {
            ls.add(column.get(i));
        }
        String schemaString = ls.toString().replace("[", "").replace("]", "").replace(" ", "");
        System.out.println(schemaString); // e.g. id,name,location,sex,position

        List<StructField> fields = new ArrayList<>();
        for (String fieldName : schemaString.split(",")) {
            StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
            fields.add(field);
        }
        StructType schema = DataTypes.createStructType(fields);

        JavaRDD<Row> rowRDD = Ordd.map((Function<Object, Row>) record -> {
            @SuppressWarnings("unchecked")
            List<String> values = (List<String>) record;
            return RowFactory.create(values.toArray());
        });

        Dataset<Row> df = spark.createDataFrame(rowRDD, schema);
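        // Aside (added note, not from the original post): the Mongo connector can
        // also infer a schema directly, which would replace the manual
        // Document -> Row conversion above. A sketch, assuming the same
        // spark.mongodb.input.uri configuration:
        //
        //     Dataset<Row> inferred = spark.read()
        //             .format("com.mongodb.spark.sql.DefaultSource")
        //             .load();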

spark.sql("use datalake");//注册临时表

df.registerTempTable("mgtable");if ("O".equals(opType.trim())) {

System.out.println("数据插入到Hive ordinary table");

Long t1=System.currentTimeMillis();

spark.sql("insert into mgtohive_2" + querysql + " " + "where b.id not in (select id from mgtohive_2)");

System.out.println("insert into mgtohive_2" + querysql + " ");

Long t2=System.currentTimeMillis();

System.out.println("共耗时:" + (t2 - t1) / 60000 + "分钟");

}else if ("P".equals(opType.trim())) {

System.out.println("数据插入到Hive dynamic partition table");

Long t3=System.currentTimeMillis();//必须设置以下参数 否则报错

spark.sql("set hive.exec.dynamic.partition.mode=nonstrict");//sex为分区字段 select语句最后一个字段必须是sex

spark.sql("insert into mg_hive_external partition(sex) select id,name,location,position,sex from mgtable b where b.id not in (select id from mg_hive_external)");

Long t4=System.currentTimeMillis();

System.out.println("共耗时:"+(t4 -t3)/60000+ "分钟");

}

        spark.stop();
    }
}
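The post never shows the SQLUtils helper it relies on. A minimal sketch of what getColumns presumably does, assuming the flat query form "select col1,col2,... from table" used above (hypothetical reconstruction, not the author's code):

import java.util.ArrayList;
import java.util.List;

public class SQLUtils {
    // Extract the column names between "select" and "from".
    // Assumes no expressions, aliases, or nested selects in the
    // column list, which holds for the queries in this post.
    public List<String> getColumns(String querysql) {
        String lower = querysql.toLowerCase();
        int start = lower.indexOf("select") + "select".length();
        int end = lower.indexOf("from");
        List<String> columns = new ArrayList<>();
        for (String col : querysql.substring(start, end).split(",")) {
            columns.add(col.trim());
        }
        return columns;
    }
}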

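The dynamic-partition branch assumes the target table mg_hive_external already exists in the datalake database and is partitioned by sex. Its DDL is not shown in the post; a hypothetical definition consistent with the insert statement (all columns as strings, since the loader stringifies every value; the "_external" name hints the real table may be EXTERNAL with a LOCATION clause):

// Hypothetical DDL for the dynamic-partition target (not from the post).
spark.sql("create table if not exists datalake.mg_hive_external ("
        + "id string, name string, location string, position string) "
        + "partitioned by (sex string)");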