Hbase Bulk Load CSV
Hbase Bulk Load CSV
第一次尝试bulk load 方式CSV快速导入Hbase,踩了不少坑,终于成功了
数据
- 0001,1565764353,Alice,1990-01-01
- 0002,1565764332,Bob,1992-01-01
- 0003,1565764343,Cindy,1994-01-01
- 0001,1565764345,Alice,1992-02-01
命令方式
创建表,可设置多版本 VERSIONS:版本数量
create 'tablename',{NAME=>'student',VERSIONS=>2147483647}
使用ImportTsv,添加参数 -Dimporttsv.bulk.output:hdfs路径,产生Hfile
注意hbase导入必须把每一列都用上 -Dimporttsv.columns:这个可以指定每一列的列族和列名,包括可以指定HBASE_TS_KEY(时间戳),HBASE_ROW_KEY(RowKey)等
hbase org.apache.hadoop.hbase.mapreduce.ImportTsv -Dimporttsv.separator="," -Dimporttsv.bulk.output=/user/hbase_import/new/data -Dimporttsv.columns=HBASE_TS_KEY,HBASE_ROW_KEY,student:name,student:birth tablename /user/hbase_import/old
最后一步将生成的Hfile导入Hbase,这一步飞快
之前一直按照教程尝试hadoop jar hbase-server.jar 的方式zookeeper一直使用默认的,无法指定到自己的zookeeper上,不断报错,在老板的提示下换成了hbase的,果然好用!
hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles -Dcreate.table=no /user/hbase_import/new/data tablename
最后一步导入时若遇到 Trying to load more than 32 hfiles 的错误,可以在 hbase-site.xml 中设置
hbase.mapreduce.bulkload.max.hfiles.perRegion.perFamily=3200
API接口调用方式
用了命令之后以为大功告成,但是突然发现命令不支持csv中文本的 , 的读取,它会当作csv的分隔符,包括换行符,它就当成下一条了,这就很苦恼了,所以选择了去调用API的方式,可以自定义读取方式
直接上代码
import java.io.File;
import java.io.IOException;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FsShell;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
public class BulkLoadJob {
static Logger logger = LoggerFactory.getLogger(BulkLoadJob.class);
private static String[] headers = { "timestamp", "id", "name", "birth" };
public static class BulkLoadMap extends Mapper<LongWritable, List<Text>, ImmutableBytesWritable, Put> {
public void map(LongWritable key, List<Text> value, Context context) throws IOException, InterruptedException {
JSONObject data = Utils.constructJson(value, headers);
String hkey = data.getString("uid");
byte[] rowKey = null;
rowKey = Bytes.toBytes(hkey);
final ImmutableBytesWritable HKey = new ImmutableBytesWritable(rowKey);
Put put = new Put(Bytes.toBytes(data.getString("uid")));
for (int i = 0; i < data.size(); i++) {
put.addColumn(Bytes.toBytes("student"), Bytes.toBytes(headers[i]), data.getLong("timestamp"),
Bytes.toBytes(data.getString(headers[i])));
}
context.write(HKey, put);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
//加载自己的hbse-site.xml
conf.addResource(new File(args[3]).toURI().toURL());
String inputPath = args[0];
String outputPath = args[1];
HTable hTable = null;
try {
conf.set(HConstants.TEMPORARY_FS_DIRECTORY_KEY, "/user/temp");
Job job = Job.getInstance(conf, "import_hbase");
job.setJarByClass(BulkLoadJob.class);
job.setMapperClass(BulkLoadJob.BulkLoadMap.class);
job.setMapOutputKeyClass(ImmutableBytesWritable.class);
job.setMapOutputValueClass(Put.class);
job.setOutputKeyClass(ImmutableBytesWritable.class);
job.setOutputValueClass(Put.class);
// speculation
job.setSpeculativeExecution(false);
job.setReduceSpeculativeExecution(false);
// in/out format
FileInputFormat.setInputPaths(job, new Path(inputPath));
FileOutputFormat.setOutputPath(job, new Path(outputPath));
//CsvTextInputFormat.class 自定义CSV类
job.setInputFormatClass(CsvTextInputFormat.class);
job.setOutputFormatClass(HFileOutputFormat2.class);
hTable = new HTable(conf, args[2]);
HFileOutputFormat2.configureIncrementalLoad(job, hTable);
if (job.waitForCompletion(true)) {
FsShell shell = new FsShell(conf);
try {
shell.run(new String[] { "-chmod", "-R", "777", args[1] });
} catch (Exception e) {
logger.error("Couldnt change the file permissions ", e);
throw new IOException(e);
}
// 载入到hbase表
LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
loader.doBulkLoad(new Path(outputPath), hTable);
} else {
logger.error("loading failed.");
System.exit(1);
}
} catch (IllegalArgumentException e) {
e.printStackTrace();
} finally {
if (hTable != null) {
hTable.close();
}
}
}
}
此代码参考 地址.
感谢公司大佬指导
pom.xml 依赖部分
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>1.1.7</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>1.1.7</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-common</artifactId>
<version>1.1.7</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-mapreduce</artifactId>
<version>2.2.0</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.6.5</version>
</dependency>
<dependency>
<groupId>net.sourceforge.javacsv</groupId>
<artifactId>javacsv</artifactId>
<version>2.0</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-csv</artifactId>
<version>1.5</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.4</version>
</dependency>
</dependencies>
执行语句
hadoop jar hbaseload.jar /user/hbase_import/old /user/hbase_import/new tablename /hbase-site.xml
Hbase查询结果
hbase shell 进入hbase界面
list 查看所有表
scan 'tablename' 可查看所有信息
scan 'tablename', FILTER=>"ValueFilter(=,'binary:2001-01-01')" 查看包含此内容的信息
get 'tablename','0001',{COLUMN=>'student',VERSIONS=>20,TIMESTAMP=>1565764353} 查看指定RowKey、时间戳的信息
get 'tablename','0001',{COLUMN=>'student',VERSIONS=>20,TIMERANGE=>[1565764353,1565764380]} 查看指定RowKey、时间戳范围内的信息