Reading OrcStruct Files with a Hadoop Map-Reduce Program

This post shows how to write a Java Map-Reduce program that uses OrcInputFormat to read OrcStruct files stored on a Hadoop cluster. It consists of a Driver class that sets the input and output paths and configures OrcInputFormat, and a Mapper class that processes the OrcStruct records and writes out the results.

Table schema:

(The screenshot of the table schema from the original post is not recoverable.)
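Although the screenshot is gone, the Mapper below reads three columns by index, so the table must have at least three fields, with columns 1 and 2 readable as strings (they are cast to Text). For illustration only, with invented column names, a schema of that shape can be expressed with the ORC TypeDescription API:

import org.apache.orc.TypeDescription;

public class SchemaSketch {
    public static void main(String[] args) {
        // Invented column names -- the real schema came from the missing screenshot.
        // What the Mapper relies on: at least three columns, with columns 1 and 2
        // being string columns in ORC.
        TypeDescription schema = TypeDescription.fromString(
                "struct<bike_id:string,city:string,color:string>");
        System.out.println(schema);  // prints the struct<...> type string back
    }
}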

The Map-Reduce program (map phase only, no reduce):

Driver:

import java.time.LocalDate;
import java.time.format.DateTimeFormatter;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.orc.mapreduce.OrcInputFormat;

public class MyDriver extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        LocalDate runDate = null;
        DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd");

        // Validate the command-line arguments: exactly one date is expected.
        if (args == null || args.length == 0) {
            System.err.println("Missing argument: please pass a date (yyyy-MM-dd)!");
            System.exit(1);
        } else if (args.length == 1) {
            runDate = LocalDate.parse(args[0], formatter);
        } else {
            System.err.println("Too many arguments: at most one is allowed!");
            System.exit(1);
        }

        // e.g. user/hive/dataware/dw/dw_bike_alert_status_clean/pt=xxx
        String inputPath = "/xxxx/xxxxxxxin/pt=" + formatter.format(runDate).replace("-", "");
        String outputPath = "/xxxx/xxxxxxout" + "/pt=" + formatter.format(runDate).replace("-", "");
        System.out.println("Input path: " + inputPath);
        System.out.println("Output path: " + outputPath);

        Configuration conf = getConf();
        // The value here was truncated in the original post: it should be a full
        // ORC struct definition such as "struct<col1:string,...>". It is only
        // consulted when writing ORC output, which this job does not do.
        conf.set("orc.mapred.output.schema", "struct");
        FileSystem fs = FileSystem.get(conf);
        conf.set("inputPath", inputPath);
        conf.set("outputPath", outputPath);

        Job job = Job.getInstance(conf, "OrcInputFormat reader");
        job.setJarByClass(MyDriver.class);

        // Map-only job: configure the Mapper and disable the reduce phase.
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setNumReduceTasks(0);

        FileInputFormat.addInputPaths(job, inputPath);
        // Read the input as ORC.
        job.setInputFormatClass(OrcInputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        // Delete the output directory if it already exists.
        if (fs.exists(new Path(outputPath))) {
            fs.delete(new Path(outputPath), true);
        }

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        int status = ToolRunner.run(configuration, new MyDriver(), args);
        System.exit(status);
    }
}
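Assuming the class is packaged into a jar (the jar name below is a placeholder), the job would be submitted with the run date as its single argument, matching the yyyy-MM-dd pattern the Driver parses:

hadoop jar orc-reader.jar MyDriver 2018-06-01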

Mapper:

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.orc.mapred.OrcStruct;

public class MyMapper extends Mapper<NullWritable, OrcStruct, Text, Text> {

    @Override
    protected void map(NullWritable key, OrcStruct value, Context context)
            throws IOException, InterruptedException {
        // getFieldValue(0), getFieldValue(1), getFieldValue(2) access columns by index.
        // Guard both columns that are written so a null field is never emitted.
        if (value.getFieldValue(1) != null && value.getFieldValue(2) != null) {
            context.write((Text) value.getFieldValue(1), (Text) value.getFieldValue(2));
        }
    }
}
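If the field indexes used in the Mapper are in doubt, a small standalone reader can print the actual schema of one ORC file from the input partition. A minimal sketch using the public org.apache.orc reader API (the file path is passed as an argument):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;

public class OrcSchemaDump {
    public static void main(String[] args) throws Exception {
        // args[0]: path to one ORC file, e.g. an HDFS path under the pt= partition.
        Reader reader = OrcFile.createReader(
                new Path(args[0]), OrcFile.readerOptions(new Configuration()));
        // Prints e.g. struct<bike_id:string,city:string,color:string>, which shows
        // what getFieldValue(0/1/2) in MyMapper will return.
        System.out.println(reader.getSchema());
    }
}

The same information is also available from the command line via hive --orcfiledump on the file.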

Reposted from: https://my.oschina.net/u/2338224/blog/1830266
