Background
The data collected earlier in this project is stored in a MySQL table of roughly one million rows. It now needs to be loaded into HBase according to the new storage requirements.
Preparation
1) Table schemas (the schemas below are for demonstration only)
MySQL:
CREATE TABLE `test` (
  `Id` int(11) NOT NULL AUTO_INCREMENT,
  `ip` varchar(15) NOT NULL DEFAULT '',
  `tm` int(11) NOT NULL DEFAULT '0',
  `mem` tinyint(3) NOT NULL DEFAULT '0',
  PRIMARY KEY (`Id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
HBase:
create 'testWn','info'
Rowkey format: ip_tm
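For example, a row with ip = 10.0.0.1 and tm = 1609459200 (illustrative values, not taken from the real data) gets the rowkey 10.0.0.1_1609459200.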
2) Background knowledge
From the Spark documentation on reading JDBC data sources:
url: the JDBC URL to connect to.
dbtable: the JDBC table to read from. A parenthesized subquery can be used in place of a full table name.
partitionColumn, lowerBound, upperBound, numPartitions: these options must all be specified together.
partitionColumn is the column used to split the read; it must be of a type the documentation allows ("partitionColumn must be a numeric, date, or timestamp column").
lowerBound and upperBound only decide the partition stride; they are not used to filter the rows read from the table.
numPartitions is the maximum number of partitions and must be an integer; with 0 or a negative value the actual number of partitions is 1. It is also not necessarily the final partition count: for example, when upperBound - lowerBound < numPartitions, the actual number of partitions is upperBound - lowerBound. The resulting partitions cover contiguous ranges, which the ids printed by the job below confirm; see the sketch after this list.
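A minimal, self-contained sketch of how the four partitioning options behave, assuming a local MySQL instance holding the demo test table and placeholder credentials (the exact per-partition boundaries vary slightly between Spark versions):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class PartitionProbe {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("partitionProbe")
                .getOrCreate();
        // With lowerBound=1, upperBound=1000, numPartitions=5, Spark issues one
        // SELECT per partition, each bounded by a WHERE range on Id, roughly:
        //   "Id < 200 or Id is null", "Id >= 200 AND Id < 400", ..., "Id >= 800".
        // The bounds do NOT filter: ids outside [1, 1000] all land in the two
        // outermost partitions.
        Dataset<Row> df = spark.read()
                .format("jdbc")
                .option("url", "jdbc:mysql://localhost:3306/databaseName?user=***&password=***")
                .option("dbtable", "test")
                .option("partitionColumn", "Id")
                .option("lowerBound", 1)
                .option("upperBound", 1000)
                .option("numPartitions", 5)
                .load();
        System.out.println("partitions: " + df.javaRDD().getNumPartitions()); // expected: 5
        spark.stop();
    }
}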
Implementation
pom.xml
<dependencies>
    <!-- MySQL JDBC driver -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>${mysql.version}</version>
    </dependency>
    <!-- Spark SQL -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_${scala.binary.version}</artifactId>
        <version>${spark.version}</version>
        <!--<scope>provided</scope>-->
    </dependency>
    <!-- HBase client; exclude the conflicting slf4j binding and hadoop-common
         (hadoop-common is re-added below with an explicit version) -->
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-client</artifactId>
        <version>${hbase.version}</version>
        <exclusions>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
            </exclusion>
            <exclusion>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-common</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
</dependencies>
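The version properties referenced above are assumed to be defined in the same POM; the versions below are illustrative placeholders (any mutually compatible Spark/HBase/Hadoop set will do), not the ones from the original project:

<properties>
    <scala.binary.version>2.11</scala.binary.version>
    <spark.version>2.3.0</spark.version>
    <mysql.version>5.1.47</mysql.version>
    <hbase.version>1.2.6</hbase.version>
    <hadoop.version>2.7.3</hadoop.version>
</properties>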
Main.java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

public class Main {

    public static void main(String[] args) throws Exception {
        SparkSession sparkSession = SparkSession.builder()
                .master("local[*]")
                .appName("test")
                .config("spark.sql.warehouse.dir", "./datas/meta")
                .getOrCreate();

        // Read MySQL in sliding windows of 1000 ids. This assumes the
        // auto-increment Id column is contiguous (no gaps); an empty
        // window terminates the loop.
        long lowerBound = 1;
        long upperBound = 1000;
        for (;;) {
            Dataset<Row> df = getMysqlDF(sparkSession, lowerBound, upperBound);
            System.out.println("id window lowerBound--upperBound>>>>>" + lowerBound + "--" + upperBound);
            // Compute the count once: each call of df.count() on a JDBC
            // DataFrame runs a separate query against MySQL.
            long count = df.count();
            if (count > 0) {
                System.out.println("df.count()>>>>>" + count); // rows in the current id window
                JavaRDD<Row> javaRDDRow = df.select("ip", "tm", "mem", "Id").javaRDD();
                javaRDDRow.foreachPartition(new VoidFunction<Iterator<Row>>() {
                    @Override
                    public void call(Iterator<Row> rowIterator) throws Exception {
                        // One HBase connection per partition; all rows of the
                        // partition are batched into a single put.
                        Connection connection = ConnectionFactory.createConnection(getHbaseConf());
                        // Must match the table created in the preparation step.
                        Table table = connection.getTable(TableName.valueOf("testWn"));
                        try {
                            List<Put> putList = new ArrayList<>();
                            StringBuilder idStrBld = new StringBuilder();
                            int i = 0;
                            while (rowIterator.hasNext()) {
                                Row row = rowIterator.next();
                                String ip = String.valueOf(row.get(0));
                                String tm = String.valueOf(row.get(1));
                                String mem = String.valueOf(row.get(2));
                                String id = String.valueOf(row.get(3));
                                idStrBld.append(id).append(" ");
                                i++;
                                String rowkey = ip + "_" + tm; // rowkey format: ip_tm
                                String columnFamily = "info";
                                Put put = new Put(Bytes.toBytes(rowkey));
                                put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("mem"), Bytes.toBytes(mem));
                                putList.add(put);
                            }
                            // The id dump is only there to make the partition
                            // assignment visible (contiguous id ranges).
                            System.out.println("partition row count--ids>>>>>" + i + "--" + idStrBld);
                            table.put(putList);
                        } finally {
                            table.close();
                            connection.close();
                        }
                    }
                });
                // Slide the window forward by the number of rows just processed.
                lowerBound = lowerBound + count;
                upperBound = upperBound + count;
            } else {
                break;
            }
        }
        sparkSession.stop();
    }

    // MySQL read options
    private static Dataset<Row> getMysqlDF(SparkSession sparkSession, long lowerBound, long upperBound) {
        String url = "jdbc:mysql://localhost:3306/databaseName?user=***&password=***";
        // A parenthesized subquery restricted to the current id window,
        // used in place of a full table name.
        String table = "(select * from test where id between " + lowerBound
                + " and " + upperBound + ") as T";
        return sparkSession.read()
                .format("jdbc")
                .option("url", url)
                .option("dbtable", table)
                .option("partitionColumn", "id")
                .option("lowerBound", lowerBound)
                .option("upperBound", upperBound)
                .option("numPartitions", 5)
                .load();
    }

    // HBase connection settings
    private static Configuration getHbaseConf() {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        conf.set("hbase.zookeeper.quorum", "***");
        conf.set("hbase.master", "***:60000");
        return conf;
    }
}
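After the job finishes, the load can be spot-checked from the HBase shell; each returned row should have an ip_tm rowkey and a single info:mem cell:

scan 'testWn', {LIMIT => 5}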