HCatalog provides a data transfer API for parallel input and output. This API provides a way to read data from a Hadoop cluster and to write data into a Hadoop cluster.
The data transfer API has three important classes:
HCatReader – reads data from a Hadoop cluster
HCatWriter – writes data into a Hadoop cluster
DataTransferFactory – generates reader and writer instances
Auxiliary classes in the data transfer API include:
ReadEntity
ReaderContext
WriteEntity
WriterContext
Note: HCatalog is not thread safe.
HCatReader
Reading is a two-step process. The first step occurs on the master node; the second step is done in parallel on multiple slave nodes.
Reads are done on a "ReadEntity". Before you start to read, you must define a ReadEntity (using ReadEntity.Builder); you can specify a database name, table name, partition, and filter string. For example:
ReadEntity.Builder builder = new ReadEntity.Builder();
ReadEntity entity = builder.withDatabase("mydb").withTable("mytbl").build();
The code snippet above defines a ReadEntity object ("entity") with the database name "mydb" and the table name "mytbl", which can be used to read all of the data in that table. Note that this table must exist before the operation begins.
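The builder can also narrow a read to a single partition, or to partitions matching a filter expression. A minimal sketch, assuming the builder's withPartition and withFilter methods; the partition column "ds" and its values are hypothetical:
Map<String, String> partitionSpec = new HashMap<String, String>();
partitionSpec.put("ds", "20130101");  // hypothetical partition column and value
ReadEntity partEntity = new ReadEntity.Builder()
    .withDatabase("mydb")
    .withTable("mytbl")
    .withPartition(partitionSpec)     // read just this partition...
    .build();
// ...or, instead of withPartition, select partitions with a filter string:
// .withFilter("ds > \"20121231\"")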
After defining a ReadEntity, obtain an instance of HCatReader using the ReadEntity and the cluster configuration:
HCatReader reader = DataTransferFactory.getHCatReader(entity, config);
The next step is to obtain a ReaderContext from the reader, as follows:
ReaderContext cntxt = reader.prepareRead();
All of the above steps take place on the master node. The master node then serializes the ReaderContext object and sends it to all the slave nodes, which use the reader context to read data:
for (InputSplit split : readCntxt.getSplits()) {
    HCatReader reader = DataTransferFactory.getHCatReader(split, readCntxt.getConf());
    Iterator<HCatRecord> itr = reader.read();
    while (itr.hasNext()) {
        HCatRecord read = itr.next();
        // ... process the record ...
    }
}
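How the serialized ReaderContext travels from the master to the slaves is up to the caller; the complete example program at the end of this document simply uses standard Java object serialization. A minimal sketch of that approach, using an in-memory byte array as a stand-in for the real transport:
// Master side: serialize the ReaderContext obtained from prepareRead().
ByteArrayOutputStream buffer = new ByteArrayOutputStream();
ObjectOutputStream oos = new ObjectOutputStream(buffer);
oos.writeObject(cntxt);
oos.close();
byte[] shipped = buffer.toByteArray();  // send these bytes to each slave

// Slave side: deserialize the bytes back into a ReaderContext before reading.
ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(shipped));
ReaderContext readCntxt = (ReaderContext) ois.readObject();
ois.close();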
HCatWriter
Similar to reading, writing is also a two-step process: the first step occurs on the master node, and the second step then runs in parallel on the slave nodes.
Writes are done on a "WriteEntity", which can be constructed in a fashion similar to the reads above:
WriteEntity.Builder builder = new WriteEntity.Builder();
WriteEntity entity = builder.withDatabase("mydb").withTable("mytbl").build();
After creating a WriteEntity, the next step is to obtain a WriterContext:
HCatWriter writer = DataTransferFactory.getHCatWriter(entity, config);
WriterContext info = writer.prepareWrite();
All of the above steps take place on the master node. The master node then serializes the WriterContext object and makes it available to all the slave nodes.
On the slave nodes, obtain an HCatWriter using the WriterContext, as follows:
HCatWriter writer = DataTransferFactory.getHCatWriter(context);
The writer then performs the write by passing an iterator of HCatRecords (hCatRecordItr) to the write method:
writer.write(hCatRecordItr);
The writer calls next() on this iterator in a loop and writes out all of the records.
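Writing is not complete until the master finalizes it. As the complete example program below shows, after all slaves have finished, the master calls commit on an HCatWriter built from the same WriteEntity and configuration (or abort to discard a failed write):
HCatWriter writer = DataTransferFactory.getHCatWriter(entity, config);
writer.commit(context);  // or writer.abort(context) to discard the write on failure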
Complete example program: the following test class, TestReaderWriter, is a complete Java program covering the reader and writer examples above:
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hive.hcatalog.data;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.ql.CommandNeedRetryException;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hive.hcatalog.common.HCatException;
import org.apache.hive.hcatalog.data.transfer.DataTransferFactory;
import org.apache.hive.hcatalog.data.transfer.HCatReader;
import org.apache.hive.hcatalog.data.transfer.HCatWriter;
import org.apache.hive.hcatalog.data.transfer.ReadEntity;
import org.apache.hive.hcatalog.data.transfer.ReaderContext;
import org.apache.hive.hcatalog.data.transfer.WriteEntity;
import org.apache.hive.hcatalog.data.transfer.WriterContext;
import org.apache.hive.hcatalog.mapreduce.HCatBaseTest;
import org.junit.Assert;
import org.junit.Test;
public class TestReaderWriter extends HCatBaseTest {

  @Test
  public void test() throws MetaException, CommandNeedRetryException,
      IOException, ClassNotFoundException {

    driver.run("drop table mytbl");
    driver.run("create table mytbl (a string, b int)");
    Iterator<Entry<String, String>> itr = hiveConf.iterator();
    Map<String, String> map = new HashMap<String, String>();
    while (itr.hasNext()) {
      Entry<String, String> kv = itr.next();
      map.put(kv.getKey(), kv.getValue());
    }

    WriterContext cntxt = runsInMaster(map);

    File writeCntxtFile = File.createTempFile("hcat-write", "temp");
    writeCntxtFile.deleteOnExit();

    // Serialize context.
    ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(writeCntxtFile));
    oos.writeObject(cntxt);
    oos.flush();
    oos.close();

    // Now, deserialize it.
    ObjectInputStream ois = new ObjectInputStream(new FileInputStream(writeCntxtFile));
    cntxt = (WriterContext) ois.readObject();
    ois.close();

    runsInSlave(cntxt);
    commit(map, true, cntxt);

    ReaderContext readCntxt = runsInMaster(map, false);

    File readCntxtFile = File.createTempFile("hcat-read", "temp");
    readCntxtFile.deleteOnExit();
    oos = new ObjectOutputStream(new FileOutputStream(readCntxtFile));
    oos.writeObject(readCntxt);
    oos.flush();
    oos.close();

    ois = new ObjectInputStream(new FileInputStream(readCntxtFile));
    readCntxt = (ReaderContext) ois.readObject();
    ois.close();

    for (int i = 0; i < readCntxt.numSplits(); i++) {
      runsInSlave(readCntxt, i);
    }
  }

  private WriterContext runsInMaster(Map<String, String> config) throws HCatException {
    WriteEntity.Builder builder = new WriteEntity.Builder();
    WriteEntity entity = builder.withTable("mytbl").build();
    HCatWriter writer = DataTransferFactory.getHCatWriter(entity, config);
    WriterContext info = writer.prepareWrite();
    return info;
  }

  private ReaderContext runsInMaster(Map<String, String> config, boolean bogus)
      throws HCatException {
    ReadEntity entity = new ReadEntity.Builder().withTable("mytbl").build();
    HCatReader reader = DataTransferFactory.getHCatReader(entity, config);
    ReaderContext cntxt = reader.prepareRead();
    return cntxt;
  }

  private void runsInSlave(ReaderContext cntxt, int slaveNum) throws HCatException {
    HCatReader reader = DataTransferFactory.getHCatReader(cntxt, slaveNum);
    Iterator<HCatRecord> itr = reader.read();
    int i = 1;
    while (itr.hasNext()) {
      HCatRecord read = itr.next();
      HCatRecord written = getRecord(i++);
      // Argh, HCatRecord doesnt implement equals()
      Assert.assertTrue("Read: " + read.get(0) + "Written: " + written.get(0),
          written.get(0).equals(read.get(0)));
      Assert.assertTrue("Read: " + read.get(1) + "Written: " + written.get(1),
          written.get(1).equals(read.get(1)));
      Assert.assertEquals(2, read.size());
    }
    //Assert.assertFalse(itr.hasNext());
  }

  private void runsInSlave(WriterContext context) throws HCatException {
    HCatWriter writer = DataTransferFactory.getHCatWriter(context);
    writer.write(new HCatRecordItr());
  }

  private void commit(Map<String, String> config, boolean status,
      WriterContext context) throws IOException {
    WriteEntity.Builder builder = new WriteEntity.Builder();
    WriteEntity entity = builder.withTable("mytbl").build();
    HCatWriter writer = DataTransferFactory.getHCatWriter(entity, config);
    if (status) {
      writer.commit(context);
    } else {
      writer.abort(context);
    }
  }

  private static HCatRecord getRecord(int i) {
    List<Object> list = new ArrayList<Object>(2);
    list.add("Row #: " + i);
    list.add(i);
    return new DefaultHCatRecord(list);
  }

  private static class HCatRecordItr implements Iterator<HCatRecord> {
    int i = 0;

    @Override
    public boolean hasNext() {
      return i++ < 100 ? true : false;
    }

    @Override
    public HCatRecord next() {
      return getRecord(i);
    }

    @Override
    public void remove() {
      throw new RuntimeException();
    }
  }
}