- Using MapReduce to read/write a table in HBase
install Hadoop & HBase
install hadoop
install hbase
suppose you already have ‘hbase-1.3.0-bin.tar.gz’ -> link
extract hbase-1.3.0-bin.tar.gz to /usr/local
sudo tar zxvf hbase-1.3.0-bin.tar.gz -C /usr/local
- rename to ‘hbase’
sudo mv /usr/local/hbase-1.3.0/ /usr/local/hbase/
- modify file permissions (specifically, change the owner of ‘hbase’; after that you don’t have to use ‘sudo’, type a password, or run into other permission problems)
sudo chown -R your_user_name /usr/local/hbase/
- Set the environment variable
sudo vim /etc/profile
//then append the following content
#set hbase path
export PATH=$PATH:/usr/local/hbase/bin
- enable the environment variable (‘source’ only affects the current shell; since we modified /etc/profile, new login shells will pick it up automatically)
source /etc/profile
- configure hbase
vim /usr/local/hbase/conf/hbase-env.sh
//append content
export JAVA_HOME=/usr/lib/jvm/jdk1.8.0_101
export HBASE_MANAGES_ZK=true
vim /usr/local/hbase/conf/hbase-site.xml
//add these <property> entries inside <configuration>
<property>
<name>hbase.rootdir</name>
<value>hdfs://localhost:9000/hbase</value>
</property>
<property>
<name>hbase.cluster.distributed</name>
<value>true</value>
</property>
start HBase (make sure Hadoop has been started first; see the command sketch below)
- use the shell command ‘jps’ to list all the Java processes running on this machine
- after successfully starting Hadoop, ‘jps’ should show ‘NameNode’, ‘DataNode’, ‘SecondaryNameNode’, ‘NodeManager’ and ‘ResourceManager’
- after starting HBase, ‘HMaster’ and ‘HRegionServer’ should appear as well
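For reference, a minimal sketch of the start sequence, assuming Hadoop is installed under /usr/local/hadoop (adjust the paths to your own installation):
/usr/local/hadoop/sbin/start-dfs.sh
/usr/local/hadoop/sbin/start-yarn.sh
/usr/local/hbase/bin/start-hbase.sh
jps   #check that the processes listed above are running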
enter hbase shell
/usr/local/hbase/bin/hbase shell
2017-03-26 23:29:47,401 WARN [main] util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/local/hbase/lib/slf4j-log4j12-1.7.5.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/local/hadoop/share/hadoop/common/lib/slf4j-log4j12-1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
HBase Shell; enter 'help<RETURN>' for list of supported commands.
Type "exit<RETURN>" to leave the HBase Shell
Version 1.3.0, re359c76e8d9fd0d67396456f92bcbad9ecd7a710, Tue Jan 3 05:31:38 MSK 2017
hbase(main):001:0> list
TABLE
Course
SC
Student
person
test1
test2
6 row(s) in 0.2590 seconds
=> ["Course", "SC", "Student", "person", "test1", "test2"]
hbase(main):002:0> scan 'test1'
ROW COLUMN+CELL
s01 column=name:, timestamp=1490498667244, value=book1
s01 column=price:, timestamp=1490498677562, value=20
s02 column=name:, timestamp=1490498687903, value=book2
s02 column=price:, timestamp=1490498695027, value=30
s03 column=name:, timestamp=1490498707093, value=book3
s03 column=price:, timestamp=1490498702665, value=25
3 row(s) in 0.1630 seconds
hbase(main):003:0>
- using hbase shell -> link
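For reference, a table like ‘test1’ above could have been created and filled with commands like these (a sketch mirroring the shell commands used later in this note; the row keys and values are just examples):
hbase shell > create 'test1','name','price'
hbase shell > put 'test1','s01','name','book1'
hbase shell > put 'test1','s01','price','20'
hbase shell > scan 'test1'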
install Eclipse and the Hadoop Eclipse plugin
as for hadoop-eclipse-plugin, it is better to download it from GitHub -> link
- the release version included in that GitHub project works as well
Using Hadoop and Its Java API
/*
 * use the Java API to implement the shell command: ls
 */
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import java.io.*;
import java.text.SimpleDateFormat;

public class HdfsLs { // class name is arbitrary, chosen here for illustration
    public static void main(String[] args) {
        try {
            Configuration conf = new Configuration();
            conf.set("fs.default.name", "hdfs://localhost:9000");
            String remoteFilePath = "/user/username/asd.txt"; // HDFS path
            FileSystem fs = FileSystem.get(conf);
            Path remotePath = new Path(remoteFilePath);
            FileStatus[] fileStatuses = fs.listStatus(remotePath); // ls
            for (FileStatus s : fileStatuses) {
                System.out.println("Path: " + s.getPath().toString());
                System.out.println("Auth: " + s.getPermission().toString());
                System.out.println("Size: " + s.getLen());
                /* convert the timestamp to a readable date/time */
                long timeStamp = s.getModificationTime();
                SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                String date = format.format(timeStamp);
                System.out.println("Time: " + date);
            }
            fs.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
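If you prefer to run this outside Eclipse, a minimal sketch of compiling and running it from the command line (assuming the class above is saved as HdfsLs.java and the ‘hadoop’ command is on your PATH):
javac -cp $(hadoop classpath) HdfsLs.java
java -cp $(hadoop classpath):. HdfsLs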
/*
 * use the Java API to implement the shell command: cat
 */
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import java.io.*;

public class HdfsCat { // class name is arbitrary, chosen here for illustration
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.default.name", "hdfs://localhost:9000");
        String remoteFilePath = "/user/username/asd.txt"; // HDFS path
        FileSystem fs = FileSystem.get(conf);
        Path remotePath = new Path(remoteFilePath);
        FSDataInputStream in = fs.open(remotePath);
        BufferedReader d = new BufferedReader(new InputStreamReader(in));
        String line = null;
        while ((line = d.readLine()) != null) {
            System.out.println(line);
        }
        d.close();
        in.close();
        fs.close();
    }
}
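The same API can also write to HDFS; below is a hedged sketch of uploading a local file with FileSystem.copyFromLocalFile() (the class name and the local/remote paths are placeholders for illustration):
/*
 * upload a local file to HDFS (sketch; paths are placeholders)
 */
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;

public class HdfsUpload { // class name is arbitrary
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.default.name", "hdfs://localhost:9000");
        String localFilePath = "/home/username/asd.txt";   // local path (placeholder)
        String remoteFilePath = "/user/username/asd.txt";  // HDFS path (placeholder)
        FileSystem fs = FileSystem.get(conf);
        // copyFromLocalFile(src, dst) does the same job as 'hdfs dfs -put'
        fs.copyFromLocalFile(new Path(localFilePath), new Path(remoteFilePath));
        fs.close();
    }
}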
Using HBase and Its Java API
/*
 * implement the hbase shell command: scan
 * (print the content of a table)
 */
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.*;

// public static attributes (fields of the surrounding class)
Configuration configuration; // public static
Connection connection;       // public static
Admin admin;                 // public static

// init
configuration = HBaseConfiguration.create();
configuration.set("hbase.rootdir", "hdfs://localhost:9000/hbase");
connection = ConnectionFactory.createConnection(configuration);
admin = connection.getAdmin();

// scan (tableName is a String holding the name of the table to scan)
Table table = connection.getTable(TableName.valueOf(tableName));
Scan scan = new Scan();
ResultScanner scanner = table.getScanner(scan);
for (Result result : scanner) {
    for (Cell cell : result.rawCells()) {
        System.out.print("row: " + new String(CellUtil.cloneRow(cell)) + " ");
        System.out.print("col family: " + new String(CellUtil.cloneFamily(cell)) + " ");
        System.out.print("col qualifier: " + new String(CellUtil.cloneQualifier(cell)) + " ");
        System.out.print("value: " + new String(CellUtil.cloneValue(cell)) + " ");
        System.out.println("timestamp: " + cell.getTimestamp());
    }
}

// close
admin.close();
connection.close();
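To complement the scan fragment above, here is a hedged sketch of inserting one row with the same client API (the table ‘test1’, the row key ‘s04’ and the values are made up for illustration; org.apache.hadoop.hbase.client.Put and org.apache.hadoop.hbase.util.Bytes are the extra classes needed):
// put: insert one row (sketch; assumes 'connection' from the init fragment is open)
Table table = connection.getTable(TableName.valueOf("test1"));
Put put = new Put(Bytes.toBytes("s04")); // row key (example value)
put.addColumn(Bytes.toBytes("name"), null, Bytes.toBytes("book4"));  // family 'name', empty qualifier
put.addColumn(Bytes.toBytes("price"), null, Bytes.toBytes("40"));    // family 'price', empty qualifier
table.put(put);
table.close();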
MapReduce with files stored in the filesystem (as opposed to HBase)
official tutorial -> link
the classic example: ‘wordcount’
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCount {
    public static class TokenizerMapper
            extends Mapper<Object, Text, Text, IntWritable> {
        /*
         * extends Mapper<Type_Key_In, Type_Value_In, Type_Key_Out, Type_Value_Out>
         * @Type_Key_In    type of the input key read from the source data
         * @Type_Value_In  type of the input value read from the source data
         * @Type_Key_Out   type of the output key of the intermediate data
         * @Type_Value_Out type of the output value of the intermediate data
         */
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        /*
         * this override works as a callback function:
         * it is called once for every input key-value pair.
         * use @context to write the intermediate data, which is also
         * formatted as key-value pairs and becomes the reducer's input,
         * so it is IMPORTANT to make sure that
         *   map's Type_Key_Out   == reduce's Type_Key_In
         *   map's Type_Value_Out == reduce's Type_Value_In
         */
        public void map(Object key, Text value, Context context
                        ) throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one); // key = "word", value = 1
            }
        }
    }
    public static class IntSumReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {
        /*
         * extends Reducer<Type_Key_In, Type_Value_In, Type_Key_Out, Type_Value_Out>
         * @Type_Key_In    type of the input key read from the intermediate data
         * @Type_Value_In  type of the input value read from the intermediate data
         * @Type_Key_Out   type of the output key written to the destination data
         * @Type_Value_Out type of the output value written to the destination data
         */
        private IntWritable result = new IntWritable();

        /*
         * this override also works as a callback function:
         * all values that share the same key are grouped together
         * (one key associated with many values), and this function is
         * called once per group, so the source data is divided by key
         * and processed by many reduce() calls.
         * use @context to write the result
         */
        public void reduce(Text key, Iterable<IntWritable> values,
                           Context context
                           ) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        //job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
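A usage sketch, assuming the class has been packaged into a jar named wordcount.jar and the input/output paths are HDFS directories of your choice (the output directory must not exist yet):
hadoop jar wordcount.jar WordCount /user/username/input /user/username/output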
MapReduce with HBase
- for HBase, there are specialized ‘mapper’ and ‘reducer’ base classes to inherit from
- TableMapper -> link
- TableReducer -> link
official example -> link
suppose there is a table in HBase with two column families, ‘name’ and ‘price’; such a table can be created and filled like this (in the driver below, the input table ‘test1’ has exactly this layout, and the output table ‘test2’ must also exist with the same column families)
hbase shell > create 'bookstore','name','price'
hbase shell > put 'bookstore','s01','name','book1'
hbase shell > put 'bookstore','s01','price','20'
TableMapper
// imports needed by the mapper:
import java.io.IOException;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.io.Text;

class HBaseSortMapper extends TableMapper<Text, Text> {
    /*
     * extends TableMapper<Type_Key_Out, Type_Value_Out>
     * the input comes from a table, so Type_Key_In and Type_Value_In
     * are fixed (ImmutableBytesWritable / Result) and need not be declared
     */
    public void map(ImmutableBytesWritable row, Result value, Context context)
            throws IOException, InterruptedException {
        /*
         * @row:     the row key as bytes
         * @value:   the Result holding all cells of this row;
         *           use getValue() to read each column value
         * @context: used to write the intermediate data
         */
        // getValue(byte[] family, byte[] qualifier)
        // @family: column family name
        // http://hbase.apache.org/0.94/book/columnfamily.html
        byte[] name_bytes = value.getValue("name".getBytes(), null);
        byte[] price_bytes = value.getValue("price".getBytes(), null);
        if (name_bytes == null || price_bytes == null) {
            System.out.println("MAP cannot find name or price, skip this row");
            return; // skip rows missing either column
        }
        String name = new String(name_bytes);
        System.out.println("MAP_get_name : " + name);
        String price = new String(price_bytes);
        System.out.println("MAP_get_price : " + price);
        context.write(new Text("sort"), new Text(name + "-" + price));
    }
}
TableReducer
// imports needed by the reducer:
import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;

class HBaseSortReducer extends TableReducer<Text, Text, ImmutableBytesWritable> {
    /*
     * extends TableReducer<Type_Key_In, Type_Value_In, Type_Output>
     * TableReducer's Type_Key_In   == TableMapper's Type_Key_Out
     * TableReducer's Type_Value_In == TableMapper's Type_Value_Out
     */
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        /*
         * @key:     the key word; in this example, key = "sort"
         * @values:  the set of values written via context in the TableMapper
         * @context: used to write the output
         */
        HashMap<Integer, String> map_name = new HashMap<Integer, String>(); // price -> name
        LinkedList<Integer> list_price = new LinkedList<Integer>();
        for (Text val : values) {
            String[] val_str_words = val.toString().split("-");
            // value format: name-price
            String name = val_str_words[0];
            String price_str = val_str_words[1];
            int price = Integer.parseInt(price_str);
            System.out.println("reduce_value : " + name + " " + price);
            list_price.add(price);
            map_name.put(price, name);
        }
        Collections.sort(list_price); // note: this scheme cannot sort correctly
                                      // when two books have different names but
                                      // the same price, because HashMap<price, name>
                                      // keeps only one name per price
        for (int i = 0; i < list_price.size(); ++i) {
            int p = list_price.get(i);
            String n = map_name.get(p);
            System.out.println("put " + n + " " + p);
            Put put = genPut(n, p); // use 'Put' to write data into HBase
            // https://hbase.apache.org/apidocs/org/apache/hadoop/hbase/client/Put.html
            context.write(null, put);
        }
    }

    private static int count = 0;

    public Put genPut(String name, int price) { // build a Put for one row
        String key = "S" + Integer.toString(count++);
        Put put = new Put(Bytes.toBytes(key));
        put.addColumn("name".getBytes(), null, name.getBytes());
        put.addColumn("price".getBytes(), null, Integer.toString(price).getBytes());
        return put;
    }
}
Driver
// imports needed by the driver:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

class HBaseSortDriver {
    public HBaseSortDriver() { }

    public int run(String[] args)
            throws IllegalArgumentException, IOException,
                   ClassNotFoundException, InterruptedException {
        System.out.println("MapReduce started.");
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "HBaseSortDriver");
        job.setJarByClass(HBaseSortDriver.class);
        job.setMapperClass(HBaseSortMapper.class);   // mapper (also set by initTableMapperJob below)
        job.setReducerClass(HBaseSortReducer.class); // reducer (also set by initTableReducerJob below)
        //job.setCombinerClass(...);                 // a combiner is not used: the reducer must
                                                     // see all values of the key at once to sort them
        //job.setOutputKeyClass(Text.class);         // the job's output key/value classes
        //job.setOutputValueClass(Text.class);       // are set by initTableReducerJob below
        //input from file
        //FileInputFormat.addInputPath(job, new Path(args[0]));
        //output to file
        //FileOutputFormat.setOutputPath(job, new Path(args[1]));
        Scan scan = new Scan();
        scan.setCaching(500);        // 1 is the default in Scan, which will be bad for MapReduce jobs
        scan.setCacheBlocks(false);  // don't set to true for MR jobs
        TableMapReduceUtil.initTableMapperJob(
            "test1",                 // input HBase table name
            scan,                    // Scan instance
            HBaseSortMapper.class,   // mapper
            Text.class,              // mapper output key
            Text.class,              // mapper output value
            job);
        TableMapReduceUtil.initTableReducerJob(
            "test2",                 // output HBase table name
            HBaseSortReducer.class,  // reducer
            job);
        return job.waitForCompletion(true) ? 0 : 1;
    }
}
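Finally, a minimal sketch of an entry point that runs the driver above (the class name HBaseSortMain is arbitrary; the job can be launched from Eclipse or packaged into a jar and run with ‘hadoop jar’, as long as the HBase jars are on the classpath):
public class HBaseSortMain { // class name is arbitrary, chosen for illustration
    public static void main(String[] args) throws Exception {
        HBaseSortDriver driver = new HBaseSortDriver();
        int status = driver.run(args); // 0 on success, 1 on failure
        System.out.println("MapReduce finished, status = " + status);
        System.exit(status);
    }
}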