读取本地ORC文件,返回OrcStruct列表

源代码:https://github.com/narata/tools

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
import org.apache.orc.mapred.OrcMapredRecordReader;
import org.apache.orc.mapred.OrcStruct;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Helpers for reading ORC files into in-memory {@link OrcStruct} rows.
 *
 * @author narata
 * @since 2019/02/21
 */
public class OrcUtils {
	/**
	 * Reads every row of an ORC file and returns the rows as a list of {@link OrcStruct}.
	 *
	 * <p>The file is read batch by batch through a {@link VectorizedRowBatch}; each row of
	 * each batch is converted into a freshly allocated {@code OrcStruct} whose fields are
	 * filled from the batch's column vectors. All rows are held in memory at once, so this
	 * is intended for files that fit comfortably in the heap.
	 *
	 * @param filename path of the ORC file to read
	 * @return one {@code OrcStruct} per row, in the order the reader produced them
	 * @throws IOException if the file cannot be opened or read
	 */
	public static List<OrcStruct> localOrcFileToList(String filename) throws IOException {
		Path orcPath = new Path(filename);
		Configuration conf = new Configuration();
		Reader reader = OrcFile.createReader(orcPath, OrcFile.readerOptions(conf));

		TypeDescription schema = reader.getSchema();
		List<TypeDescription> children = schema.getChildren();
		int numberOfChildren = children.size();
		VectorizedRowBatch batch = schema.createRowBatch();

		List<OrcStruct> resultList = new ArrayList<>();
		RecordReader rows = reader.rows();
		try {
			while (rows.nextBatch(batch)) {
				for (int r = 0; r < batch.size; r++) {
					OrcStruct result = new OrcStruct(schema);
					for (int i = 0; i < numberOfChildren; ++i) {
						// Use the current row index r: a fixed index would copy the same
						// row of the batch into every struct.
						result.setFieldValue(i, OrcMapredRecordReader.nextValue(batch.cols[i], r,
								children.get(i), result.getFieldValue(i)));
					}
					resultList.add(result);
				}
			}
		} finally {
			// Release the record reader even when reading or conversion fails.
			rows.close();
		}
		return resultList;
	}
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值