【java-spark】mysql2hbase: read MySQL data and insert it into HBase


Environment: JDK 1.8, Eclipse, Maven, Spark 2.4.0
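The post doesn't list its dependencies; the fragment below is a sketch of what the pom.xml might declare. The hbase-client and MySQL Connector/J versions are assumptions and should match your cluster (the com.mysql.cj.jdbc.Driver class used later implies Connector/J 8.x):

<dependency>
	<groupId>org.apache.spark</groupId>
	<artifactId>spark-sql_2.11</artifactId>
	<version>2.4.0</version>
</dependency>
<dependency>
	<groupId>org.apache.hbase</groupId>
	<artifactId>hbase-client</artifactId>
	<version>1.4.9</version> <!-- assumption: match the cluster's HBase version -->
</dependency>
<dependency>
	<groupId>mysql</groupId>
	<artifactId>mysql-connector-java</artifactId>
	<version>8.0.16</version> <!-- Connector/J 8.x provides com.mysql.cj.jdbc.Driver -->
</dependency>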


import java.util.Properties;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class Ms2HbaseBySpark {
	static final Logger logger = LogManager.getLogger(Ms2HbaseBySpark.class);
	static String url = "jdbc:mysql://192.168.1.*:3306/xj?useUnicode=true&characterEncoding=UTF-8&serverTimezone=Asia/Shanghai&zeroDateTimeBehavior=CONVERT_TO_NULL";
	static String driver = "com.mysql.cj.jdbc.Driver";
	static String user = "root";
	static String passwd = "root";

	public static void main(String[] args) {
		String mysqlTable = "test_xj";
		String hbaseTable = "SUV:TEST_XJ";
		SparkConf conf = new SparkConf().setAppName("ms2hbase_yb");
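		// Use Kryo instead of default Java serialization: faster and more compact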
		conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
//		conf.setMaster("local");
		SparkSession sparkSession = SparkSession.builder().config(conf).getOrCreate();
		// JDBC connection properties
		Properties connectionProperties = new Properties();
		connectionProperties.put("user", user);
		connectionProperties.put("password", passwd);
		connectionProperties.put("driver", driver);
		// Read the whole table into a DataFrame
		Dataset<Row> jdbcDS = sparkSession.read().jdbc(url, mysqlTable, connectionProperties);
		// Convert to an RDD
		JavaRDD<Row> javaRDD = jdbcDS.javaRDD();
		// Iterate over the RDD and write each row to HBase
		javaRDD.foreach(new VoidFunction<Row>() {
			@Override
			public void call(Row t) throws Exception {
				try {
					// Column order follows the MySQL table: id, name, ah, esIds ('!@#'-delimited row keys)
					Object id = t.get(0);
					String name = (String) t.get(1);
					String ah = (String) t.get(2);
					String esIds = (String) t.get(3);
					if (esIds != null && !esIds.isEmpty()) {
						String[] split = esIds.split("!@#");
						for (String cc : split) {
							Put put = new Put(Bytes.toBytes(cc));
							put.addColumn(Bytes.toBytes("TCL"), Bytes.toBytes("name"), Bytes.toBytes(name));
							put.addColumn(Bytes.toBytes("TCL"), Bytes.toBytes("ah"), Bytes.toBytes(ah));
							put.addColumn(Bytes.toBytes("TCL"), Bytes.toBytes("source"), Bytes.toBytes(id.toString()));
							// write the Put to HBase via the helper
							HBaseUtil.saveData(hbaseTable, put);
						}
					}

				} catch (Exception e) {
					logger.error("Failed to write row to HBase", e);
				}
			}
		});

	}
}
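
HBaseUtil.saveData is the author's own helper and isn't shown in the post. A minimal sketch of what such a helper could look like with the standard hbase-client API; the cached-connection pattern and the ZooKeeper quorum value are assumptions:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;

public class HBaseUtil {
	// One Connection per executor JVM: it is thread-safe and expensive to create.
	private static volatile Connection connection;

	private static Connection getConnection() throws IOException {
		if (connection == null) {
			synchronized (HBaseUtil.class) {
				if (connection == null) {
					Configuration conf = HBaseConfiguration.create();
					// assumption: ZooKeeper quorum of the target cluster
					conf.set("hbase.zookeeper.quorum", "192.168.1.*");
					connection = ConnectionFactory.createConnection(conf);
				}
			}
		}
		return connection;
	}

	public static void saveData(String tableName, Put put) throws IOException {
		try (Table table = getConnection().getTable(TableName.valueOf(tableName))) {
			table.put(put);
		}
	}
}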

The approach above is too slow. Analysis showed the bottleneck is the single SQL statement that reads the whole table through one JDBC connection, so the version below splits the read into multiple SQL statements.
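
As an aside, Spark's JDBC reader can also parallelize the read natively over a numeric column, which achieves a similar effect without hand-written SQL. A sketch, where the column name and the id bounds are assumptions:

		Properties props = new Properties();
		props.put("user", user);
		props.put("password", passwd);
		props.put("driver", driver);
		// One query per partition over ranges of the id column.
		// lowerBound/upperBound only shape the split ranges; all rows are still read.
		Dataset<Row> parallelDS = sparkSession.read().jdbc(
				url, mysqlTable,
				"id",       // assumption: numeric, roughly uniform primary key
				1L,         // assumption: minimum id
				10000000L,  // assumption: maximum id
				10,         // number of partitions = concurrent JDBC queries
				props);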


import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;


public class Ms2HbaseBySpark {
	static final Logger logger = LogManager.getLogger(Ms2HbaseBySpark.class);
	
	static String url = "jdbc:mysql://192.168.1.*:3306/xj?useUnicode=true&characterEncoding=UTF-8&serverTimezone=Asia/Shanghai&zeroDateTimeBehavior=CONVERT_TO_NULL";
	static String driver = "com.mysql.cj.jdbc.Driver";
	static String user = "root";
	static String passwd = "root";

	public static void main(String[] args) {
		String mysqlTable = "test_xj";
		String hbaseTable = "SUV:TEST_XJ";
		String partition = "true";
		SparkConf conf = new SparkConf().setAppName("ms2hbase_yb");
		conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
		// conf.setMaster("local");
		SparkSession sparkSession = SparkSession.builder().config(conf).getOrCreate();
		// Assumption: the first CLI argument, if present, sets the number of split
		// queries ("count", defaulting to 10 in getMysqlData).
		HashMap<String, String> argsMap = new HashMap<>();
		if (args.length > 0) {
			argsMap.put("count", args[0]);
		}

		Dataset<Row> all = null;
		// Split the read into multiple SQL statements
		List<String> sqls = getMysqlData(argsMap, mysqlTable);

		for (String sql : sqls) {
			logger.info("About to execute SQL ===>" + sql);
			Dataset<Row> load = sparkSession.read().format("jdbc").option("url", url).option("driver", driver)
					.option("user", user).option("password", passwd).option("dbtable", "(" + sql + ") t1").load();
			if (all == null) {
				all = load;
			} else {
				all = all.union(load);
			}
		}

		JavaRDD<Row> javaRDD = all.javaRDD();
		// Write per partition when enabled, so each partition is processed in one pass
		boolean usePartition = "true".equals(partition);
		if (usePartition) {
			javaRDD.foreachPartition(new VoidFunction<Iterator<Row>>() {
				@Override
				public void call(Iterator<Row> tall) throws Exception {
					while (tall.hasNext()) {
						Row t = tall.next();
						try {
							// Column order follows the MySQL table: id, name, anhao, esIds
							Object id = t.get(0);
							String name = (String) t.get(1);
							String anhao = (String) t.get(2);
							String esIds = (String) t.get(3);
							if (esIds != null && !esIds.isEmpty()) {
								String[] split = esIds.split("!@#");
								for (String cc : split) {
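									// Prefix the row key with a hex bucket (row-key salting),
									// presumably to spread writes across regions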
									Put put = new Put(Bytes.toBytes(PartitionUtil.getHexPartition(cc) + "_" + cc));
									put.addColumn(Bytes.toBytes("ETL"), Bytes.toBytes("name"), Bytes.toBytes(name));
									put.addColumn(Bytes.toBytes("ETL"), Bytes.toBytes("anhao"), Bytes.toBytes(anhao));
									put.addColumn(Bytes.toBytes("ETL"), Bytes.toBytes("source"),
											Bytes.toBytes(id.toString()));

									HBaseUtil.saveData(hbaseTable, put);
								}
							}
						} catch (Exception e) {
							logger.error("Failed to write row to HBase", e);
						}

					}
				}
			});
		} else {
			// Iterate row by row and write each record to HBase
			javaRDD.foreach(new VoidFunction<Row>() {
				@Override
				public void call(Row t) throws Exception {
					try {
						// Same column layout as above: id, name, ah, esIds
						Object id = t.get(0);
						String name = (String) t.get(1);
						String ah = (String) t.get(2);
						String esIds = (String) t.get(3);
						if (esIds != null && !esIds.isEmpty()) {
							String[] split = esIds.split("!@#");
							for (String cc : split) {
								Put put = new Put(Bytes.toBytes(cc));
								put.addColumn(Bytes.toBytes("TCL"), Bytes.toBytes("name"), Bytes.toBytes(name));
								put.addColumn(Bytes.toBytes("TCL"), Bytes.toBytes("ah"), Bytes.toBytes(ah));
								put.addColumn(Bytes.toBytes("TCL"), Bytes.toBytes("source"),
										Bytes.toBytes(id.toString()));

								HBaseUtil.saveData(hbaseTable, put);
							}
						}

					} catch (Exception e) {
						logger.error("Failed to write row to HBase", e);
					}
				}
			});
		}
	}
	// Build the split SQL statements
	private static List<String> getMysqlData(HashMap<String, String> argsMap, String mysqlTable) {
		List<String> sqls = new ArrayList<>();
		int parseInt = Integer.parseInt(argsMap.getOrDefault("count", "10"));
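		// Each query selects one residue class of id modulo `count`, so the queries
		// are disjoint and together cover every row exactly once (assumes id is numeric)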
		for (int i = 0; i < parseInt; i++) {
			sqls.add("SELECT * FROM " + mysqlTable + " WHERE id%" + parseInt + "=" + i);
		}
		return sqls;
	}
}
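
PartitionUtil is likewise not shown. Judging from its use, getHexPartition derives a hex bucket prefix from the row key so that writes spread evenly over pre-split regions. A minimal sketch under that assumption:

public class PartitionUtil {
	// assumption: number of salt buckets, matching the table's pre-split regions
	private static final int BUCKETS = 16;

	// Map a key to a stable hex bucket, e.g. "a" for bucket 10.
	public static String getHexPartition(String key) {
		int bucket = (key.hashCode() & Integer.MAX_VALUE) % BUCKETS;
		return Integer.toHexString(bucket);
	}
}

Note that foreachPartition mainly saves per-record overhead here; since saveData still writes one Put at a time, collecting the partition's Puts into a list and writing them in one batch would be the next improvement.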
