shc(Spark HBase Connector)是 Hortonworks 出品的开源方案,基于 Spark 的特性进行分片处理,并通过谓词下推(predicate pushdown)提高处理性能。
1. 引入依赖包
<dependency>
    <groupId>com.hortonworks</groupId>
    <artifactId>shc-core</artifactId>
    <version>1.1.1-2.1-s_2.11</version>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-client</artifactId>
    <version>1.3.1</version>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.1.0</version>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>2.1.0</version>
    <scope>provided</scope>
</dependency>
2. Spark读HBASE
package net.ben;
import java.util.HashMap;
import java.util.Map;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.execution.datasources.hbase.HBaseTableCatalog;
public final class App {
    /**
     * Reads an HBase table ("default:table1") through the shc (Spark HBase Connector)
     * data source, registers it as a temporary Spark SQL view, and prints its rows.
     *
     * @param args The arguments of the program (unused).
     */
    public static void main(String[] args) {
        SparkSession sparkSession = SparkSession.builder().appName("SparkHBaseETLDemo").getOrCreate();
        try {
            // shc catalog JSON: maps Spark SQL column "col0" onto the HBase row key
            // and the remaining columns onto qualifiers in column family "CF".
            String catalog = "{" + " \"table\":{\"namespace\":\"default\", \"name\":\"table1\"},"
                    + " \"rowkey\":\"key\","
                    + " \"columns\":{"
                    + " \"col0\":{\"cf\":\"rowkey\", \"col\":\"key\", \"type\":\"string\"},"
                    + " \"id\":{\"cf\":\"CF\", \"col\":\"id\", \"type\":\"string\"},"
                    + " \"name\":{\"cf\":\"CF\", \"col\":\"name\", \"type\":\"string\"},"
                    + " \"age\":{\"cf\":\"CF\", \"col\":\"age\", \"type\":\"string\"},"
                    + " \"gender\":{\"cf\":\"CF\", \"col\":\"gender\", \"type\":\"string\"}"
                    + " }" + " }";
            Map<String, String> options = new HashMap<>();
            options.put(HBaseTableCatalog.tableCatalog(), catalog);
            // Load via the shc data-source implementation; the catalog option drives
            // the table/column mapping and enables partition pruning + predicate pushdown.
            Dataset<Row> df = sparkSession.read().options(options)
                    .format("org.apache.spark.sql.execution.datasources.hbase").load();
            df.createOrReplaceTempView("table1");
            Dataset<Row> r = sparkSession.sql("SELECT * from table1");
            r.show();
        } finally {
            // Release the SparkSession even if the HBase read or query fails;
            // otherwise a thrown exception would leak the underlying SparkContext.
            sparkSession.close();
        }
    }
}
3. 执行
bin/spark-submit --master yarn --deploy-mode cluster --executor-cores 2 --conf spark.yarn.keytab="user.keytab" --conf spark.yarn.principal="user" --class net.ben.App /home/abc/spark-hbase-etl-1.0-SNAPSHOT-jar-with-dependencies.jar
4. 注意
shc-core-1.1.1-2.1-s_2.11.jar 这个包在 client 模式下必须放到 Spark 的依赖包目录中,cluster 模式需要把包上传到 HDFS 中;hbase-site.xml 配置文件需要加入到 Spark 的配置中,或者在命令行中通过 --files 指定。