Mapreduce RCFile写入和读取API示例

最新推荐文章于 2024-06-21 13:46:36 发布

chengyuan2789

最新推荐文章于 2024-06-21 13:46:36 发布

阅读量98

点赞数

文章标签：大数据 java

原文链接：https://my.oschina.net/weikan/blog/711932

版权

RCFile是Facebook开发的一种高压缩比、读取高效的行列混合存储结构。通常在Hive中可以直接对一张Text表使用insert-select完成格式转换，但有时也希望直接使用MapReduce进行RCFile的读写。所需的Maven依赖如下：

<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.5.0-cdh5.2.1</version>
</dependency>

<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-serde</artifactId>
<version>0.13.1-cdh5.2.1</version>
</dependency>

<dependency>
<groupId>org.apache.hive.hcatalog</groupId>
<artifactId>hive-hcatalog-core</artifactId>
<version>0.13.1-cdh5.2.1</version>
</dependency>

读取RCFile格式文件，使用mapreduce生成Text格式文件

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hive.hcatalog.rcfile.RCFileMapReduceInputFormat;

import java.io.IOException;

/**
 * MapReduce job that reads RCFile input through {@link RCFileMapReduceInputFormat}
 * and writes every row as one line of tab-separated text.
 *
 * <p>Usage: {@code rcfile <in> <out>}.
 */
public class RcFileReaderJob {

    /**
     * Converts one RCFile row (a {@link BytesRefArrayWritable}) into a single
     * tab-delimited line emitted as the map output key.
     */
    static class RcFileMapper extends Mapper<Object, BytesRefArrayWritable, Text, NullWritable> {
        // Reused across map() calls so no new objects are allocated per record.
        private final Text column = new Text();
        private final Text outputLine = new Text();
        private final StringBuilder rowBuffer = new StringBuilder();

        /**
         * Joins all columns of {@code value} with a tab character and writes the result.
         *
         * @param key     ignored
         * @param value   the columns of the current row
         * @param context task context used to emit the line
         */
        @Override
        protected void map(Object key, BytesRefArrayWritable value, Context context)
                throws IOException, InterruptedException {
            rowBuffer.setLength(0);
            int columnCount = value.size();
            for (int i = 0; i < columnCount; i++) {
                if (i > 0) {
                    rowBuffer.append('\t');
                }
                BytesRefWritable cell = value.get(i);
                // Go through Text so the raw column bytes are decoded the same way as before.
                column.set(cell.getData(), cell.getStart(), cell.getLength());
                rowBuffer.append(column.toString());
            }
            outputLine.set(rowBuffer.toString());
            context.write(outputLine, NullWritable.get());
        }
    }

    /**
     * Writes each incoming line back out once per occurrence.
     *
     * <p>Identical rows arrive grouped under the same key; emitting the key once per
     * value keeps duplicate rows instead of silently collapsing them into one.
     */
    static class RcFileReduce extends Reducer<Text, NullWritable, Text, NullWritable> {
        @Override
        protected void reduce(Text key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            for (NullWritable ignored : values) {
                context.write(key, NullWritable.get());
            }
        }
    }

    /**
     * Configures and runs the RCFile-to-text job, blocking until it finishes.
     *
     * @param conf   base configuration for the job
     * @param input  path of the RCFile input
     * @param output path of the text output directory
     * @return {@code true} if the job completed successfully
     */
    public static boolean runLoadMapReducue(Configuration conf, Path input, Path output) throws IOException,
            ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(conf);
        job.setJarByClass(RcFileReaderJob.class);
        job.setJobName("RcFileReaderJob");
        job.setNumReduceTasks(1);
        job.setMapperClass(RcFileMapper.class);
        job.setReducerClass(RcFileReduce.class);
        job.setInputFormatClass(RCFileMapReduceInputFormat.class);
        RCFileMapReduceInputFormat.addInputPath(job, input);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileOutputFormat.setOutputPath(job, output);
        return job.waitForCompletion(true);
    }

    /**
     * Command-line entry point. Exits with 2 on bad usage, 0 when the job succeeds
     * and 1 when it fails, so calling scripts can detect a failed conversion.
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: rcfile <in> <out>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        boolean succeeded = RcFileReaderJob.runLoadMapReducue(conf, new Path(args[0]), new Path(args[1]));
        System.exit(succeeded ? 0 : 1);
    }
}

读取文本文件，使用mapreduce生成RCFile格式文件

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hive.hcatalog.rcfile.RCFileMapReduceOutputFormat;

import java.io.IOException;

/**
 * MapReduce job that reads tab-separated text and writes it as RCFile through
 * {@link RCFileMapReduceOutputFormat}.
 *
 * <p>Run via {@link ToolRunner}; recognised arguments are {@code -tableName},
 * {@code -numCols}, {@code -input}, {@code -output}, {@code -rowGroupSize} and
 * {@code -ioBufferSize}.
 */
public class RcFileWriterJob extends Configured implements Tool {

    /** Configuration key the mapper reads the column count from; the driver prints it as "number of columns". */
    private static final String COLUMN_NUMBER_KEY = "hive.io.rcfile.column.number.conf";
    private static final String COUNTER_GROUP = "RcFileWriterJob";
    private static final String SHORT_ROWS_COUNTER = "ROWS_WITH_MISSING_COLUMNS";

    /** Splits each input line on tabs and emits the first {@code numCols} fields as one RCFile row. */
    public static class Map extends Mapper<Object, Text, NullWritable, BytesRefArrayWritable> {
        private static final byte[] EMPTY_FIELD = new byte[0];

        private int numCols;
        private BytesRefArrayWritable bytes;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            numCols = context.getConfiguration().getInt(COLUMN_NUMBER_KEY, 0);
            bytes = new BytesRefArrayWritable(numCols);
        }

        /**
         * Converts one text line into a row of {@code numCols} byte columns.
         *
         * <p>Lines with fewer than {@code numCols} fields are padded with empty columns
         * (and counted) instead of failing the task with an index-out-of-bounds error;
         * extra trailing fields are ignored.
         *
         * @param key     ignored
         * @param line    one tab-separated input record
         * @param context task context used to emit the row
         */
        @Override
        public void map(Object key, Text line, Context context) throws IOException, InterruptedException {
            bytes.clear();
            // Limit -1 keeps trailing empty fields so column positions stay aligned.
            String[] cols = line.toString().split("\t", -1);
            if (cols.length < numCols) {
                context.getCounter(COUNTER_GROUP, SHORT_ROWS_COUNTER).increment(1);
            }
            for (int i = 0; i < numCols; i++) {
                byte[] fieldData = i < cols.length ? cols[i].getBytes("UTF-8") : EMPTY_FIELD;
                bytes.set(i, new BytesRefWritable(fieldData, 0, fieldData.length));
            }
            context.write(NullWritable.get(), bytes);
        }
    }

    /**
     * Parses the command line, configures the job and waits for it to finish.
     *
     * @param args command-line arguments
     * @return 0 on success, 1 if the job failed, 2 on invalid usage
     */
    @Override
    public int run(String[] args) throws Exception {
        // Use the configuration injected by ToolRunner so -D / -conf options take effect;
        // fall back to a fresh one when run() is invoked without setConf().
        Configuration conf = getConf();
        if (conf == null) {
            conf = new Configuration();
        }
        // Harmless when ToolRunner already stripped the generic options; needed for direct callers.
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            printUsage();
            return 2;
        }

        String tableName = "";
        int numCols = 0;
        String inputPath = "";
        String outputPath = "";
        int rowGroupSize = 16 * 1024 * 1024;
        int ioBufferSize = 128 * 1024;
        try {
            for (int i = 0; i < otherArgs.length - 1; i++) {
                String option = otherArgs[i];
                String optionValue = otherArgs[i + 1];
                if ("-tableName".equals(option)) {
                    tableName = optionValue;
                } else if ("-numCols".equals(option)) {
                    numCols = Integer.parseInt(optionValue);
                } else if ("-input".equals(option)) {
                    inputPath = optionValue;
                } else if ("-output".equals(option)) {
                    outputPath = optionValue;
                } else if ("-rowGroupSize".equals(option)) {
                    rowGroupSize = Integer.parseInt(optionValue);
                } else if ("-ioBufferSize".equals(option)) {
                    ioBufferSize = Integer.parseInt(optionValue);
                }
            }
        } catch (NumberFormatException e) {
            System.err.println("Invalid numeric argument: " + e.getMessage());
            printUsage();
            return 2;
        }

        // Fail fast with the usage text rather than with an obscure error from Path or the writer.
        if (inputPath.isEmpty() || outputPath.isEmpty() || numCols <= 0) {
            System.err.println("-input, -output and a positive -numCols are required");
            printUsage();
            return 2;
        }

        conf.setInt("hive.io.rcfile.record.buffer.size", rowGroupSize);
        conf.setInt("io.file.buffer.size", ioBufferSize);

        Job job = Job.getInstance(conf);
        job.setJobName("RcFileWriterJob");
        job.setJarByClass(RcFileWriterJob.class);
        job.setMapperClass(Map.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(BytesRefArrayWritable.class);
        // The reducer count is deliberately left at the job default, as in the original.

        FileInputFormat.addInputPath(job, new Path(inputPath));

        job.setOutputFormatClass(RCFileMapReduceOutputFormat.class);
        RCFileMapReduceOutputFormat.setColumnNumber(job.getConfiguration(), numCols);
        RCFileMapReduceOutputFormat.setOutputPath(job, new Path(outputPath));
        RCFileMapReduceOutputFormat.setCompressOutput(job, false);

        System.out.println("Loading table " + tableName + " from " + inputPath + " to RCFile located at " + outputPath);
        System.out.println("number of columns:" + job.getConfiguration().get("hive.io.rcfile.column.number.conf"));
        System.out.println("RCFile row group size:" + job.getConfiguration().get("hive.io.rcfile.record.buffer.size"));
        System.out.println("io bufer size:" + job.getConfiguration().get("io.file.buffer.size"));

        return (job.waitForCompletion(true) ? 0 : 1);
    }

    /** Prints the command-line usage and two example invocations to standard output. */
    private static void printUsage() {
        System.out.println("Usage: " +
                "hadoop jar RCFileLoader.jar <main class> " +
                "-tableName <tableName> -numCols <numberOfColumns> -input <input path> " +
                "-output <output path> -rowGroupSize <rowGroupSize> -ioBufferSize <ioBufferSize>");
        System.out.println("For test");
        System.out.println("$HADOOP jar RCFileLoader.jar edu.osu.cse.rsam.rcfile.mapreduce.LoadTable " +
                "-tableName test1 -numCols 10 -input RCFileLoaderTest/test1 " +
                "-output RCFileLoaderTest/RCFile_test1");
        System.out.println("$HADOOP jar RCFileLoader.jar edu.osu.cse.rsam.rcfile.mapreduce.LoadTable " +
                "-tableName test2 -numCols 5 -input RCFileLoaderTest/test2 " +
                "-output RCFileLoaderTest/RCFile_test2");
    }

    /** Command-line entry point; exits with the status returned by {@link #run(String[])}. */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new RcFileWriterJob(), args);
        System.exit(res);
    }

}

转载于:https://my.oschina.net/weikan/blog/711932

chengyuan2789

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Mapreduce RCFile写入和读取API示例

RCFile是FaceBook开发的高压缩比、高效读的行列存储结构。通常在Hive中可以直接对一张Text表使用insert-select转换，但有时希望使用Mapreduce进行RCFile的读写。 <dependency> <gr...
复制链接

扫一扫