用Spark实现MySQL到HBase的数据迁移,通常比Sqoop效率高得多。该代码自动识别MySQL表的主键并将其作为HBase的行键;遇到联合主键时,将各主键列的值用下划线连接后作为行键。
1. 下图为工程目录结构
2. MySQL2HBase.scala
import java.sql.{DriverManager, SQLException}
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.spark._
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import scala.collection.mutable.ArrayBuffer

/**
 * Migrates a MySQL table into HBase with Spark.
 *
 * The MySQL primary key becomes the HBase row key; for a composite primary
 * key the key column values are joined with "_". Every column is written as
 * a readable string under the given column family.
 */
object MySQL2HBase {

  /** Entry point: delegates to [[dothis]]. */
  def main(args: Array[String]): Unit = {
    dothis(args)
  }

  /**
   * Runs the migration end to end.
   *
   * Expected arguments, in order:
   *   0 mysqlHost        MySQL host/ip, e.g. "master"
   *   1 mysqlDatabase    source database name
   *   2 mysqlTableName   source table name
   *   3 mysqlUserName    MySQL user
   *   4 mysqlPassWord    MySQL password
   *   5 hbaseTableName   target HBase table (must already exist)
   *   6 hbaseColFamName  target HBase column family
   *
   * @throws IllegalArgumentException when fewer than 7 arguments are given
   * @throws IllegalStateException    when the source table has no primary key
   */
  def dothis(args: Array[String]): Unit = {
    // 0. argument list
    println(s"main function's arguments length is: ${args.length}")
    println("main function's arguments are: " + args.mkString(" "))
    require(args.length >= 7,
      "usage: MySQL2HBase <mysqlHost> <mysqlDatabase> <mysqlTable> <user> <password> <hbaseTable> <hbaseColumnFamily>")

    val mysqlHost       = args(0)
    val mysqlDatabase   = args(1)
    val mysqlTableName  = args(2)
    val mysqlUserName   = args(3)
    val mysqlPassWord   = args(4)
    val hbaseTableName  = args(5)
    val hbaseColFamName = args(6)

    // 1. SparkConf / SparkSession (the master is supplied by spark-submit)
    println("-------------------sprakConf和saprkSession---------------------")
    val sparkConf = new SparkConf().setAppName("MySQL2HBase")
    val spark = SparkSession.builder().config(sparkConf).getOrCreate()

    try {
      // 2. read the MySQL table through the Spark JDBC data source
      println("-------------------JDBC连接MySQL读取数据---------------------")
      val jdbcDF = spark.read.format("jdbc")
        .option("url", s"jdbc:mysql://$mysqlHost:3306/$mysqlDatabase?useSSL=false")
        .option("driver", "com.mysql.jdbc.Driver")
        .option("dbtable", mysqlTableName)
        .option("user", mysqlUserName)
        .option("password", mysqlPassWord)
        .load()

      // 2.1 (columnName, sparkSqlTypeName) pairs in select order
      val types = jdbcDF.dtypes

      // 2.2 primary-key column names (several entries for a composite key)
      val keyArrayBuff =
        getTableKeys(mysqlHost, mysqlDatabase, mysqlTableName, mysqlUserName, mysqlPassWord)
      // Fail fast on the driver: without a primary key there is no row key.
      // The original code only failed later, inside an executor, with an
      // opaque IndexOutOfBoundsException on keyArrayBuff(0).
      if (keyArrayBuff.isEmpty) {
        throw new IllegalStateException(
          s"table $mysqlDatabase.$mysqlTableName has no primary key; cannot derive an HBase row key")
      }

      // Resolve each key column to its position once, on the driver, instead
      // of scanning all columns per key per row inside the map closure.
      // A missing key column now fails loudly here rather than silently
      // producing an empty row-key fragment.
      val colIndexByName = types.zipWithIndex.map { case ((name, _), idx) => name -> idx }.toMap
      val keyIndexes: Array[Int] = keyArrayBuff.map(colIndexByName).toArray

      // 2.3 turn every Row into (rowKey, Put); composite keys joined with "_"
      val rdd = jdbcDF.rdd.map(row => {
        val key = keyIndexes.map(i => getString(types(i)._2, row, i)).mkString("_")
        val put = new Put(Bytes.toBytes(key))
        for (i <- 0 until row.size) {
          // addColumn replaces Put.add, deprecated since HBase 1.0
          put.addColumn(Bytes.toBytes(hbaseColFamName),
            Bytes.toBytes(types(i)._1),
            Bytes.toBytes(getString(types(i)._2, row, i)))
        }
        (new ImmutableBytesWritable, put)
      })

      // 3. SparkContext for the Hadoop output configuration
      println("-------------------准备sc---------------------")
      val sc = spark.sparkContext

      // 4. write to HBase through TableOutputFormat
      sc.hadoopConfiguration.set(TableOutputFormat.OUTPUT_TABLE, hbaseTableName)
      val job = Job.getInstance(sc.hadoopConfiguration) // `new Job(conf)` is deprecated
      job.setOutputKeyClass(classOf[ImmutableBytesWritable])
      // The RDD's values are Put; the original declared classOf[Result],
      // which does not match what saveAsNewAPIHadoopDataset actually writes.
      job.setOutputValueClass(classOf[Put])
      job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
      rdd.saveAsNewAPIHadoopDataset(job.getConfiguration())
    } finally {
      spark.stop() // release the Spark application even when the write fails
    }
  }

  /**
   * Renders column `i` of `row` as a String according to its Spark SQL type
   * name, so values land in HBase as readable text instead of raw binary
   * (numeric columns otherwise show up as mojibake in the HBase shell).
   *
   * SQL NULL yields "" — the original code threw a NullPointerException on
   * e.g. null strings/timestamps. Types without a dedicated branch (LongType,
   * DateType, DecimalType, ...) fall back to the generic string form instead
   * of silently becoming "" as before.
   */
  def getString(value_type: String, row: Row, i: Int): String = {
    if (row.isNullAt(i)) {
      ""
    } else value_type match {
      case "IntegerType"   => row.getInt(i).toString
      case "StringType"    => row.getString(i)
      case "FloatType"     => row.getFloat(i).toString
      case "DoubleType"    => row.getDouble(i).toString
      case "TimestampType" => row.getTimestamp(i).toString
      case _               => row.get(i).toString
    }
  }

  /**
   * Returns the primary-key column names of `mysqlTable` (multiple entries
   * for a composite key) as reported by JDBC metadata. Returns an empty
   * buffer when the driver cannot be loaded, the connection fails, or the
   * table has no primary key.
   */
  def getTableKeys(mysqlHost: String, mysqlDB: String, mysqlTable: String,
                   mysqlUser: String, mysqlPW: String): ArrayBuffer[String] = {
    val keyArrayBuff = new ArrayBuffer[String]()
    try {
      Class.forName("com.mysql.jdbc.Driver")
      val conn = DriverManager.getConnection(
        s"jdbc:mysql://$mysqlHost:3306/$mysqlDB?useUnicode=true&characterEncoding=utf8&useOldAliasMetadataBehavior=true",
        mysqlUser, mysqlPW)
      try {
        // MySQL maps the database to the JDBC *catalog*; the schema is null.
        // The original passed getMetaData().getUserName() as the schema
        // pattern, which can be "user@host" and then matches nothing.
        val rs = conn.getMetaData.getPrimaryKeys(mysqlDB, null, mysqlTable)
        while (rs.next()) {
          keyArrayBuff += rs.getString("COLUMN_NAME")
          println(keyArrayBuff.last)
        }
        rs.close()
      } finally {
        conn.close() // previously leaked when getPrimaryKeys threw
      }
    } catch {
      case e: ClassNotFoundException => e.printStackTrace()
      case e: SQLException           => e.printStackTrace()
    }
    keyArrayBuff
  }
}
3. simple.sbt
name := "MySQL2HBase"
version := "1.0"
scalaVersion := "2.11.8"
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.1.0"
libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.1.0"
libraryDependencies += "org.apache.hbase" % "hbase-client" % "1.2.6"
libraryDependencies += "org.apache.hbase" % "hbase-common" % "1.2.6"
libraryDependencies += "org.apache.hbase" % "hbase-server" % "1.2.6"
// 代码中通过 com.mysql.jdbc.Driver 连接MySQL,需显式声明JDBC驱动依赖
libraryDependencies += "mysql" % "mysql-connector-java" % "5.1.40"
4. 下图为sbt编译打包
5. spark-submit执行
spark-submit --class MySQL2HBase mysql2hbase_2.11-1.0.jar master world countrylanguage mysqlUsername mysqlPassWord world city
// mysql主机ip "master"
// mysql中被导入的数据库名 "world"
// mysql中被导入的数据表名 "countrylanguage"
// mysql用户名 "mysqlUsername"
// mysql密码 "mysqlPassWord"
// 导入hbase的表名 "world"
// 导入hbase的列族 "city"