Migrating Hive Data to MongoDB with Spark (updateSave | upsertSave)

Data in the Hive emp table:

hive (soul)> select * from emp;
OK
emp.empno	emp.ename	emp.job	emp.age	emp.deptno
7369	SMITH	CLERK	24	10
7499	ALLEN	SALESMAN	30	20
7521	WARD	SALESMAN	25	30
7654	MARTIN	SALESMAN	23	10
7698	BLAKE	MANAGER	29	40

pom.xml (key dependencies):

  <properties>
    <scala.version>2.11.8</scala.version>
    <spark.version>2.2.0</spark.version>
    <hive.version>1.1.0</hive.version>
  </properties>

    <dependency>
      <groupId>org.mongodb.spark</groupId>
      <artifactId>mongo-spark-connector_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>

    <dependency>
      <groupId>org.mongodb</groupId>
      <artifactId>mongo-java-driver</artifactId>
      <version>3.6.3</version>
    </dependency>


    <dependency>
      <groupId>org.mongodb</groupId>
      <artifactId>bson</artifactId>
      <version>3.4.0</version>
    </dependency>


    <!--SparkHive-->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-hive_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <!--MySQL Driver-->
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>5.1.39</version>
    </dependency>

    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>

A utility class providing MongoDB updateSave (update existing documents) and upsertSave (update if present, insert if not):

package com.soul.utils

import com.mongodb.client.MongoCollection
import com.mongodb.client.model.{ReplaceOneModel, UpdateOneModel}
import com.mongodb.spark.MongoConnector
import com.mongodb.spark.config.WriteConfig
import org.apache.spark.rdd.RDD
import org.bson.Document
import scala.collection.JavaConverters._
import scala.reflect.ClassTag

/**
  * @author soulChun
  * @create 2018-12-18-20:37
  */
object MongoUtils {


  // Operations beyond this size are split into multiple bulkWrite calls.
  val DefaultMaxBatchSize = 100000

  /** Updates existing documents, deriving the WriteConfig from the RDD's SparkContext. */
  def updateSave[D: ClassTag](rdd: RDD[UpdateOneModel[Document]]): Unit = updateSave(rdd, WriteConfig(rdd.sparkContext))

  /** Updates existing documents: each partition is grouped into batches and written with bulkWrite. */
  def updateSave[D: ClassTag](rdd: RDD[UpdateOneModel[D]], writeConfig: WriteConfig): Unit = {
    val mongoConnector = MongoConnector(writeConfig.asOptions)
    rdd.foreachPartition(iter => if (iter.nonEmpty) {
      mongoConnector.withCollectionDo(writeConfig, { collection: MongoCollection[D] =>
        iter.grouped(DefaultMaxBatchSize).foreach(batch => collection.bulkWrite(batch.toList.asJava))
      })
    })
  }

  /** Upserts documents (replace if present, insert if not), deriving the WriteConfig from the RDD's SparkContext. */
  def upsertSave[D: ClassTag](rdd: RDD[ReplaceOneModel[Document]]): Unit = upsertSave(rdd, WriteConfig(rdd.sparkContext))

  /** Upserts documents: each partition is grouped into batches and written with bulkWrite. */
  def upsertSave[D: ClassTag](rdd: RDD[ReplaceOneModel[D]], writeConfig: WriteConfig): Unit = {
    val mongoConnector = MongoConnector(writeConfig.asOptions)
    rdd.foreachPartition(iter => if (iter.nonEmpty) {
      mongoConnector.withCollectionDo(writeConfig, { collection: MongoCollection[D] =>
        iter.grouped(DefaultMaxBatchSize).foreach(batch => collection.bulkWrite(batch.toList.asJava))
      })
    })
  }


}
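Because both methods take their ClassTag as an implicit parameter via the [D: ClassTag] context bound, Java callers pass the underlying Scala RDD and its ClassTag explicitly. Shown in isolation, the call pattern used in sections 3 and 4 below is:

// Call pattern from Java: unwrap the JavaRDD and pass its ClassTag explicitly,
// because the Scala side declares the ClassTag as an implicit parameter.
MongoUtils.updateSave(rddUpdate.rdd(), rddUpdate.classTag());
MongoUtils.upsertSave(rddUpsert.rdd(), rddUpsert.classTag());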

1. Save the DataFrame to MongoDB

package com.soul.sparkmg;
import com.mongodb.client.model.Filters;
import com.mongodb.client.model.ReplaceOneModel;
import com.mongodb.client.model.UpdateOneModel;
import com.mongodb.client.model.UpdateOptions;
import com.mongodb.spark.MongoSpark;
import com.soul.utils.MongoUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.hive.HiveContext;
import org.bson.Document;

/**
 * @author soulChun
 * @create 2018-12-15-16:17
 */
public class SparkHiveToMg {
    public static void main(String[] args) {

        SparkConf conf = new SparkConf().setAppName("SparkHiveToMg").setMaster("local[2]");
        // If your password contains an '@' character, encode it as %40
        conf.set("spark.mongodb.output.uri", "mongodb://root:root@127.0.0.1/soul_db.emp");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        HiveContext hc = new HiveContext(jsc);
        Dataset<Row> df = hc.table("soul.emp");

        // Save the DataFrame directly to MongoDB
        MongoSpark.save(df);
        jsc.stop();
    }
}

Running the program automatically creates the emp collection in MongoDB (emp comes from the URI and can be changed) and inserts the data; all five rows are now stored in MongoDB.
[Screenshot: the five emp documents in MongoDB]
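To double-check the write from the same job, one option (a minimal sketch, not part of the original program) is to also set spark.mongodb.input.uri and read the collection back; the count should be 5:

// Hedged verification sketch. spark.mongodb.input.uri is an assumption added here;
// it is not part of the original configuration. Set it before creating the JavaSparkContext.
// Requires: import com.mongodb.spark.rdd.api.java.JavaMongoRDD;
conf.set("spark.mongodb.input.uri", "mongodb://root:root@127.0.0.1/soul_db.emp");

// After MongoSpark.save(df):
JavaMongoRDD<Document> readBack = MongoSpark.load(jsc);
System.out.println("documents in emp: " + readBack.count());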

2. Save an RDD to MongoDB

package com.soul.sparkmg;

import com.mongodb.client.model.Filters;
import com.mongodb.client.model.ReplaceOneModel;
import com.mongodb.client.model.UpdateOneModel;
import com.mongodb.client.model.UpdateOptions;
import com.mongodb.spark.MongoSpark;
import com.soul.utils.MongoUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.hive.HiveContext;
import org.bson.Document;

/**
 * @author soulChun
 * @create 2018-12-15-16:17
 */
public class SparkHiveToMg {
    public static void main(String[] args) {

        SparkConf conf = new SparkConf().setAppName("SparkHiveToMg").setMaster("local[2]");
        // If your password contains an '@' character, encode it as %40
        conf.set("spark.mongodb.output.uri", "mongodb://root:root@127.0.0.1/soul_db.emp");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        HiveContext hc = new HiveContext(jsc);
        Dataset<Row> df = hc.table("soul.emp");

        JavaRDD<Row> rdd = df.toJavaRDD();
        // insert: map each Hive row to a MongoDB Document
        JavaRDD<Document> rddDoc = rdd.map(new Function<Row, Document>() {
            public Document call(Row row) throws Exception {
                Document doc = new Document();
                doc.put("empno", row.get(0));
                doc.put("ename", row.get(1));
                doc.put("job", row.get(2));
                doc.put("age", row.get(3));
                doc.put("deptno", row.get(4));
                return doc;
            }
        });
        MongoSpark.save(rddDoc);

        jsc.stop();
    }
}
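A small variation (a sketch, not from the original post): the Row fields can be resolved by column name with getAs instead of by position, which keeps the mapping readable if the Hive column order ever changes. The map above could be written as:

        // Name-based column access; a drop-in replacement for the positional map above.
        JavaRDD<Document> rddDoc = rdd.map(new Function<Row, Document>() {
            public Document call(Row row) throws Exception {
                Document doc = new Document();
                doc.put("empno", row.getAs("empno"));
                doc.put("ename", row.getAs("ename"));
                doc.put("job", row.getAs("job"));
                doc.put("age", row.getAs("age"));
                doc.put("deptno", row.getAs("deptno"));
                return doc;
            }
        });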

3. Update existing documents
Change the age of the first document in MongoDB to 100.
[Screenshot: the first emp document with age set to 100]

Then run the following program:

package com.soul.sparkmg;

import com.mongodb.client.model.Filters;
import com.mongodb.client.model.ReplaceOneModel;
import com.mongodb.client.model.UpdateOneModel;
import com.mongodb.client.model.UpdateOptions;
import com.mongodb.spark.MongoSpark;
import com.soul.utils.MongoUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.hive.HiveContext;
import org.bson.Document;

/**
 * @author soulChun
 * @create 2018-12-15-16:17
 */
public class SparkHiveToMg {
    public static void main(String[] args) {

        SparkConf conf = new SparkConf().setAppName("SparkHiveToMg").setMaster("local[2]");
        // If your password contains an '@' character, encode it as %40
        conf.set("spark.mongodb.output.uri", "mongodb://root:root@127.0.0.1/soul_db.emp");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        HiveContext hc = new HiveContext(jsc);
        Dataset<Row> df = hc.table("soul.emp");
        JavaRDD<Row> rdd = df.toJavaRDD();

        // update: wrap each row in an UpdateOneModel keyed on empno
        JavaRDD<UpdateOneModel<Document>> rddUpdate = rdd.map(new Function<Row, UpdateOneModel<Document>>() {
            public UpdateOneModel<Document> call(Row row) throws Exception {
                Document doc = new Document();
                doc.put("empno", row.get(0));
                doc.put("ename", row.get(1));
                doc.put("job", row.get(2));
                doc.put("age", row.get(3));
                doc.put("deptno", row.get(4));
                Document modifiers = new Document();
                modifiers.put("$set", doc);
                return new UpdateOneModel<Document>(Filters.eq("empno", doc.get("empno")), modifiers, new UpdateOptions().upsert(true));
            }
        });
        MongoUtils.updateSave(rddUpdate.rdd(), rddUpdate.classTag());
        jsc.stop();
    }
}

After it finishes, MongoDB still contains five documents, and the age has been updated back to its original value of 30.
[Screenshot: the five emp documents with age restored]
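For reference, the bulk write that MongoUtils.updateSave issues per batch boils down to the following plain-driver call. This is a standalone sketch (hypothetical class name, and it assumes the same credentials authenticate against soul_db); it also shows how the matched/modified counts could be inspected, which the utility class currently ignores:

import com.mongodb.MongoClient;
import com.mongodb.MongoClientURI;
import com.mongodb.bulk.BulkWriteResult;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.model.Filters;
import com.mongodb.client.model.UpdateOneModel;
import com.mongodb.client.model.UpdateOptions;
import org.bson.Document;

import java.util.Arrays;

public class BulkUpdateSketch {
    public static void main(String[] args) {
        // Assumption: root/root can authenticate against soul_db directly.
        MongoClient client = new MongoClient(new MongoClientURI("mongodb://root:root@127.0.0.1/soul_db"));
        MongoCollection<Document> emp = client.getDatabase("soul_db").getCollection("emp");
        // One UpdateOneModel per Hive row; here a single example using SMITH's data.
        BulkWriteResult result = emp.bulkWrite(Arrays.asList(
                new UpdateOneModel<Document>(
                        Filters.eq("empno", 7369),
                        new Document("$set", new Document("age", 24)),
                        new UpdateOptions().upsert(true))));
        System.out.println("matched=" + result.getMatchedCount() + ", modified=" + result.getModifiedCount());
        client.close();
    }
}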

4. Update existing documents and insert missing ones (upsert)
Delete documents 4 and 5 from the emp collection in MongoDB, and change the age of the first document to 200.
[Screenshot: the emp collection after deleting two documents and setting age to 200]

Then run the following program:

package com.soul.sparkmg;

import com.mongodb.client.model.Filters;
import com.mongodb.client.model.ReplaceOneModel;
import com.mongodb.client.model.UpdateOneModel;
import com.mongodb.client.model.UpdateOptions;
import com.mongodb.spark.MongoSpark;
import com.soul.utils.MongoUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.hive.HiveContext;
import org.bson.Document;

/**
 * @author soulChun
 * @create 2018-12-15-16:17
 */
public class SparkHiveToMg {
    public static void main(String[] args) {

        SparkConf conf = new SparkConf().setAppName("SparkHiveToMg").setMaster("local[2]");
        // If your password contains an '@' character, encode it as %40
        conf.set("spark.mongodb.output.uri", "mongodb://root:root@127.0.0.1/soul_db.emp");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        HiveContext hc = new HiveContext(jsc);
        Dataset<Row> df = hc.table("soul.emp");
        JavaRDD<Row> rdd = df.toJavaRDD();

        // upsert: wrap each row in a ReplaceOneModel keyed on empno
        JavaRDD<ReplaceOneModel<Document>> rddUpsert = rdd.map(new Function<Row, ReplaceOneModel<Document>>() {
            public ReplaceOneModel<Document> call(Row row) throws Exception {
                Document doc = new Document();
                doc.put("empno", row.get(0));
                doc.put("ename", row.get(1));
                doc.put("job", row.get(2));
                doc.put("age", row.get(3));
                doc.put("deptno", row.get(4));
                return new ReplaceOneModel<Document>(Filters.eq("empno", doc.get("empno")), doc, new UpdateOptions().upsert(true));
            }
        });

        MongoUtils.upsertSave(rddUpsert.rdd(), rddUpsert.classTag());
        jsc.stop();
    }
}

Running it shows that the data has been restored.
[Screenshot: the restored emp collection with five documents]

When doing updates or upserts, remember to create an index in MongoDB on the join field empno used in Filters.eq("empno", doc.get("empno")); this improves performance. If your company has a scheduling platform that supports dynamic parameters, the code above can be turned into a plugin that migrates any Hive table.
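A minimal sketch of creating that index with the Java driver (hypothetical class name, run once before the migration jobs; it assumes the same credentials authenticate against soul_db, and whether the index should be unique depends on your data, so that option is left out):

import com.mongodb.MongoClient;
import com.mongodb.MongoClientURI;
import com.mongodb.client.model.Indexes;

public class CreateEmpnoIndex {
    public static void main(String[] args) {
        // Create an ascending index on empno so the per-row Filters.eq lookups
        // during update/upsert do not scan the whole collection.
        MongoClient client = new MongoClient(new MongoClientURI("mongodb://root:root@127.0.0.1/soul_db"));
        client.getDatabase("soul_db").getCollection("emp").createIndex(Indexes.ascending("empno"));
        client.close();
    }
}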
