Reading SequenceFiles with MapReduce

Create a SequenceFile-format Hive table
create table test_seqencefile(name string,age int) stored as SEQUENCEFILE

The resulting table definition (show create table test_seqencefile):
CREATE TABLE `test_seqencefile`(
  `name` string, 
  `age` int)
ROW FORMAT SERDE 
  'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' 
STORED AS INPUTFORMAT 
  'org.apache.hadoop.mapred.SequenceFileInputFormat' 
OUTPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat'
LOCATION
  'hdfs://localhost:9000/user/work/warehouse/test_seqencefile'
TBLPROPERTIES (
  'transient_lastDdlTime'='1502950306')
Insert test data
insert into table test_seqencefile select name ,age from test limit 10;
Analyzing the MR job behind the Hive insert statement

The configuration of the MR job that executes the insert statement (taken from the JobHistory server):

Mapper:org.apache.hadoop.hive.ql.exec.mr.ExecMapper
Reducer:org.apache.hadoop.hive.ql.exec.mr.ExecReducer	
mapreduce.job.output.key.class:org.apache.hadoop.io.Text
mapreduce.job.output.value.class:org.apache.hadoop.io.Text
file.outputformat:org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat

From the table definition we can see that test_seqencefile is serialized with org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, which means the fields are separated by '\1' (Hive's default delimiter). The file output format is controlled by org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat; looking at its code, we find:

final SequenceFile.Writer outStream = Utilities.createSequenceWriter(jc, fs, finalOutPath,
    BytesWritable.class, valueClass, isCompressed, progress);

So when the MR job actually writes the file, the key type is BytesWritable and the value type is Text. Note that the key type differs from what JobHistory reports (mapreduce.job.output.key.class: org.apache.hadoop.io.Text), which is odd but important: when reading a SequenceFile with MR, the key and value types declared in the Mapper must match the types actually stored in the file.
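If you are not sure which types a SequenceFile actually stores, you can read them from the file header before writing any MR code. The following is a minimal sketch, not part of the original job; the file name 000000_0 is only a typical Hive output file name and may differ in your table directory:

package com.fan.hadoop.seqencefile;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;

public class SeqenceFileInspect {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // hypothetical file name: point this at an actual file under the table directory
        Path path = new Path(
                "hdfs://localhost:9000/user/work/warehouse/test_seqencefile/000000_0");

        // every SequenceFile records its key and value classes in the file header
        SequenceFile.Reader reader =
                new SequenceFile.Reader(conf, SequenceFile.Reader.file(path));
        try {
            System.out.println("key class:   " + reader.getKeyClassName());
            System.out.println("value class: " + reader.getValueClassName());
        } finally {
            reader.close();
        }
    }
}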

Reading the Hive table's SequenceFile with MR
package com.fan.hadoop.seqencefile;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;

public class SeqenceFileReadMR {

    public static class SequenceFileMaper extends Mapper<BytesWritable,Text,Text,Text>{

        @Override
        protected void map(BytesWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String t = value.toString();
            // Hive separates fields with '\1' by default
            String[] arr = t.split("\1");
            context.write(new Text(arr[0]), new Text(arr[1]));
        }
    }

    public static void main(String[] args)  throws Exception {
        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);
        job.setJarByClass(SeqenceFileReadMR.class);
        job.setJobName("seqencefileread");

        String in = "hdfs://localhost:9000/user/work/warehouse/test_seqencefile";
        String out = "hdfs://localhost:9000/test/seqencefile1";

        job.setMapperClass(SequenceFileMaper.class);
        SequenceFileInputFormat.addInputPath(job, new Path(in));

        job.setInputFormatClass(SequenceFileInputFormat.class);
        // If the key and value types stored in the file are unknown, use
        // SequenceFileAsTextInputFormat instead; the Mapper can then declare both
        // the key and the value as Text (see the sketch following this listing).
//        job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
        job.setNumReduceTasks(0);

        job.setMapOutputValueClass(Text.class);
        job.setMapOutputKeyClass(Text.class);

        job.setOutputFormatClass(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path(out));

        job.waitForCompletion(true);
    }

}
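As mentioned in the comment above, if the stored key/value types are unknown you can let the framework hand everything to the Mapper as Text. Below is a minimal sketch of that SequenceFileAsTextInputFormat variant; the class name and the output path /test/seqencefile_astext are made up for this example and are not from the original job:

package com.fan.hadoop.seqencefile;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileAsTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;

public class SeqenceFileAsTextReadMR {

    public static class AsTextMaper extends Mapper<Text, Text, Text, Text> {

        @Override
        protected void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {
            // the value is still the raw Hive row, with fields separated by '\1'
            String[] arr = value.toString().split("\1");
            context.write(new Text(arr[0]), new Text(arr[1]));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);
        job.setJarByClass(SeqenceFileAsTextReadMR.class);
        job.setJobName("seqencefileastextread");

        job.setMapperClass(AsTextMaper.class);
        FileInputFormat.addInputPath(job,
                new Path("hdfs://localhost:9000/user/work/warehouse/test_seqencefile"));

        // keys and values are both delivered as Text, regardless of the stored types
        job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
        job.setNumReduceTasks(0);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputFormatClass(TextOutputFormat.class);
        // hypothetical output path
        FileOutputFormat.setOutputPath(job,
                new Path("hdfs://localhost:9000/test/seqencefile_astext"));

        job.waitForCompletion(true);
    }
}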

Check the generated output
hadoop dfs -cat /test/seqencefile1/part-m-00000

kafka   14
tensflow        98
hadoop  34
hbase   68
flume   57
kafka   99
kafka   28
flume   24
tensflow        35
flume   44
Generating a SequenceFile

To generate a SequenceFile, use SequenceFileOutputFormat. Because the Hive table uses '\1' as the default field delimiter, the field values are joined with '\1'. The full code is below:

package com.fan.hadoop.seqencefile;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

import java.io.IOException;
import java.util.Random;
import java.util.StringTokenizer;


public class SeqenceFileWriterMR {

    public static class SequenceFileMaper extends Mapper<LongWritable,Text,LongWritable,Text>{

        private Text v = new Text();
        private Random random = new Random();


        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {

            String line = value.toString();
            StringTokenizer token = new StringTokenizer(line);
            while (token.hasMoreTokens()) {
                String s = token.nextToken();
                v.set(s+"\1"+random.nextInt(20));
                context.write(key, v);
            }
        }
    }



    public static void main(String[] args)  throws Exception {
        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);
        job.setJarByClass(SeqenceFileWriterMR.class);
        job.setJobName("seqencefileWriter");

        String in = "hdfs://localhost:9000/test/wordcount.txt";
        String out = "hdfs://localhost:9000/test/seqencefile2";

        job.setMapperClass(SequenceFileMaper.class);
        FileInputFormat.addInputPath(job, new Path(in));

        job.setInputFormatClass(TextInputFormat.class);

        job.setNumReduceTasks(0);

        // state the key/value classes stored in the SequenceFile explicitly
        // (they happen to match the defaults: LongWritable keys, Text values)
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputPath(job, new Path(out));

        job.waitForCompletion(true);
    }

}

Copy the data into the Hive table directory
hadoop dfs -cp /test/seqencefile2/* /user/work/warehouse/test_seqencefile/
Query the Hive table
hive> select * from test_seqencefile limit 5;                  
OK
hello   18
world   6
hello   12
presto  2
hello   14