spark sql+mysql+hbase

背景

项目中之前采集的数据存储在MySQL表中,约100万条,现需要将MySQL表中的数据按需求存储在HBase中。

准备工作

1)表结构(下述表结构仅用于演示)

MySQL:

CREATE TABLE `test` (
  `Id` int(11) NOT NULL AUTO_INCREMENT,
  `ip` varchar(15) NOT NULL DEFAULT '',
  `tm` int(11) NOT NULL DEFAULT '0',
  `mem` tinyint(3) NOT NULL DEFAULT '0',
  PRIMARY KEY (`Id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

HBase:

create 'testWn','info'

rowkey格式:ip_tm

2)知识储备

参考Spark官网

url:要连接的JDBC URL。

dbtable:应该读取的JDBC表。可以使用括号中的子查询代替完整表。

partitionColumn, lowerBound, upperBound, numPartitions:这些options必须同时被指定。

其中,partitionColumn:分区字段,需要是官网中指定数据类型(partitionColumn must be a numeric, date, or timestamp column);
lowerBound和upperBound仅用于决定分区的大小,而不是用于过滤表中的行;
numPartitions:最大分区数量,必须为整数,当为0或负整数时,实际的分区数为1;不一定是最终的分区数量,例如当“upperBound - lowerBound< numPartitions”时,实际的分区数量是“upperBound - lowerBound”;在分区结果中,分区是连续的,可通过结果打印得出。

具体实现

pom.xml

<dependencies>
        <!--mysql-->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>${mysql.version}</version>
        </dependency>

        <!--sparksql+hbase-->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>${hbase.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.apache.hadoop</groupId>
                    <artifactId>hadoop-common</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
    </dependencies>

Main.java

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.*;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

public class Main {
    public static void main(String[] args) throws Exception{
        SparkSession sparkSession = SparkSession.builder()
                .master("local[*]")
                .appName("test")
                .config("spark.sql.warehouse.dir","./datas/meta")
                .getOrCreate();

        long lowerBound = 1;
        long upperBound = 1000;
        for(;;){
            Dataset<Row> df = getMysqlDF(sparkSession,lowerBound,upperBound);

            System.out.println("上下限lowerBound--upperBound>>>>>"+lowerBound+"--"+upperBound);
            if(df.count()>0){
                System.out.println("df.count()>>>>>"+df.count());//df.count()的数值为table的count值
                JavaRDD<Row> javaRDDRow  = df.select("ip","tm","mem","Id").javaRDD();
                javaRDDRow.foreachPartition(new VoidFunction<Iterator<Row>>() {
                    @Override
                    public void call(Iterator<Row> rowIterator) throws Exception {
                        Connection connection = ConnectionFactory.createConnection(getHbaseConf());
                        TableName tableName = TableName.valueOf("DEVOPS:testWn");
                        Table table = connection.getTable(tableName);
                        List<Put> putList = new ArrayList<>();
                        StringBuilder idStrBld = new StringBuilder();
                        int i = 0;

                        while(rowIterator.hasNext()){
                            Row var1 = rowIterator.next();
                            String ip = String.valueOf(var1.get(0));
                            String tm = String.valueOf(var1.get(1));
                            String mem = String.valueOf(var1.get(2));
                            String id = String.valueOf(var1.get(3));
                            idStrBld.append(id).append(" ");
                            i++;

                            String rowkey = ip+"_"+tm;
                            String columnFamily = "info";
                            Put put = new Put(rowkey.getBytes());
                            put.addColumn(columnFamily.getBytes(), ("mem").getBytes(), mem.getBytes());

                            putList.add(put);
                        }
                        System.out.println("分区所有id大小--id值>>>>>"+i+"--"+idStrBld.toString());//id的打印主要是为了分区的理解
                        table.put(putList);
                        table.close();
                        connection.close();
                    }
                });
                lowerBound = lowerBound +df.count();
                upperBound=upperBound+df.count();
            }else{
                break;
            }
        }
        sparkSession.stop();
    }

    //mysql参数
    private static Dataset<Row> getMysqlDF(SparkSession sparkSession,long lowerBound,long upperBound){
        String url = "jdbc:mysql://localhost:3306/databaseName?user=***&password=***";

        StringBuilder tableStrBuilder = new StringBuilder();
        String table =tableStrBuilder.append("(select * from test where id between ")
                .append(lowerBound).append(" and ").append(upperBound).append(") as T").toString();

        Dataset<Row> df = sparkSession.read()
                .format("jdbc")
                .option("url", url)
                .option("dbtable", table)
                .option("partitionColumn","id")
                .option("lowerBound",lowerBound)
                .option("upperBound",upperBound)
                .option("numPartitions", 5)
                .load();
        return df;
    }

    //hbase参数
    private static Configuration getHbaseConf(){
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        conf.set("hbase.zookeeper.quorum", "***");
        conf.set("hbase.master", "***:60000");
        return conf;
    }
}


 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值