Reading SequenceFiles with MapReduce

Create a SequenceFile-format Hive table
create table test_seqencefile(name string,age int) stored as SEQUENCEFILE

The resulting table definition (show create table test_seqencefile):
CREATE TABLE `test_seqencefile`(
  `name` string, 
  `age` int)
ROW FORMAT SERDE 
  'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' 
STORED AS INPUTFORMAT 
  'org.apache.hadoop.mapred.SequenceFileInputFormat' 
OUTPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat'
LOCATION
  'hdfs://localhost:9000/user/work/warehouse/test_seqencefile'
TBLPROPERTIES (
  'transient_lastDdlTime'='1502950306')
Insert test data
insert into table test_seqencefile select name ,age from test limit 10;
Analyzing the MR job behind the Hive insert statement

The configuration of the MR job that executes the insert statement (taken from the JobHistory server):

Mapper:org.apache.hadoop.hive.ql.exec.mr.ExecMapper
Reducer:org.apache.hadoop.hive.ql.exec.mr.ExecReducer	
mapreduce.job.output.key.class:org.apache.hadoop.io.Text
mapreduce.job.output.value.class:org.apache.hadoop.io.Text
file.outputformat:org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat

From the table definition we can see that test_seqencefile is serialized with org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, which means the fields are separated by '\1' (Hive's default delimiter). The file output format is controlled by org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat; looking at its code, we find:

final SequenceFile.Writer outStream = Utilities.createSequenceWriter(jc, fs, finalOutPath,
    BytesWritable.class, valueClass, isCompressed, progress);

So when the MR job actually writes the file, the key type is BytesWritable and the value type is Text. Note that the key type differs from what JobHistory reports (mapreduce.job.output.key.class: org.apache.hadoop.io.Text), which is odd but important: when reading a SequenceFile with MR, the key and value types declared in the Mapper must match the types actually stored in the file.
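If you are not sure which types a SequenceFile actually stores, you can read them from the file header before writing any MR code. The following is a minimal sketch, not part of the original job; the file name 000000_0 is only a typical Hive output file name and may differ in your table directory:

package com.fan.hadoop.seqencefile;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;

public class SeqenceFileInspect {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // hypothetical file name: point this at an actual file under the table directory
        Path path = new Path(
                "hdfs://localhost:9000/user/work/warehouse/test_seqencefile/000000_0");

        // every SequenceFile records its key and value classes in the file header
        SequenceFile.Reader reader =
                new SequenceFile.Reader(conf, SequenceFile.Reader.file(path));
        try {
            System.out.println("key class:   " + reader.getKeyClassName());
            System.out.println("value class: " + reader.getValueClassName());
        } finally {
            reader.close();
        }
    }
}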

Reading the Hive table's SequenceFile with MR
package com.fan.hadoop.seqencefile;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;

public class SeqenceFileReadMR {

    public static class SequenceFileMaper extends Mapper<BytesWritable,Text,Text,Text>{

        @Override
        protected void map(BytesWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String t = value.toString();
            // Hive separates fields with '\1' by default
            String[] arr = t.split("\1");
            context.write(new Text(arr[0]), new Text(arr[1]));
        }
    }

    public static void main(String[] args)  throws Exception {
        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);
        job.setJarByClass(SeqenceFileReadMR.class);
        job.setJobName("seqencefileread");

        String in = "hdfs://localhost:9000/user/work/warehouse/test_seqencefile";
        String out = "hdfs://localhost:9000/test/seqencefile1";

        job.setMapperClass(SequenceFileMaper.class);
        SequenceFileInputFormat.addInputPath(job, new Path(in));

        job.setInputFormatClass(SequenceFileInputFormat.class);
        // If the key and value types stored in the file are unknown, use
        // SequenceFileAsTextInputFormat instead; the Mapper can then declare both
        // the key and the value as Text (see the sketch following this listing).
//        job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
        job.setNumReduceTasks(0);

        job.setMapOutputValueClass(Text.class);
        job.setMapOutputKeyClass(Text.class);

        job.setOutputFormatClass(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path(out));

        job.waitForCompletion(true);
    }

}
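As mentioned in the comment above, if the stored key/value types are unknown you can let the framework hand everything to the Mapper as Text. Below is a minimal sketch of that SequenceFileAsTextInputFormat variant; the class name and the output path /test/seqencefile_astext are made up for this example and are not from the original job:

package com.fan.hadoop.seqencefile;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileAsTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;

public class SeqenceFileAsTextReadMR {

    public static class AsTextMaper extends Mapper<Text, Text, Text, Text> {

        @Override
        protected void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {
            // the value is still the raw Hive row, with fields separated by '\1'
            String[] arr = value.toString().split("\1");
            context.write(new Text(arr[0]), new Text(arr[1]));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);
        job.setJarByClass(SeqenceFileAsTextReadMR.class);
        job.setJobName("seqencefileastextread");

        job.setMapperClass(AsTextMaper.class);
        FileInputFormat.addInputPath(job,
                new Path("hdfs://localhost:9000/user/work/warehouse/test_seqencefile"));

        // keys and values are both delivered as Text, regardless of the stored types
        job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
        job.setNumReduceTasks(0);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputFormatClass(TextOutputFormat.class);
        // hypothetical output path
        FileOutputFormat.setOutputPath(job,
                new Path("hdfs://localhost:9000/test/seqencefile_astext"));

        job.waitForCompletion(true);
    }
}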

Check the generated output
hadoop dfs -cat /test/seqencefile1/part-m-00000

kafka   14
tensflow        98
hadoop  34
hbase   68
flume   57
kafka   99
kafka   28
flume   24
tensflow        35
flume   44
Generating a SequenceFile

To generate a SequenceFile, use SequenceFileOutputFormat. Because the Hive table uses '\1' as the default field delimiter, the field values are joined with '\1'. The full code is below:

package com.fan.hadoop.seqencefile;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

import java.io.IOException;
import java.util.Random;
import java.util.StringTokenizer;


public class SeqenceFileWriterMR {

    public static class SequenceFileMaper extends Mapper<LongWritable,Text,LongWritable,Text>{

        private Text v = new Text();
        private Random random = new Random();


        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {

            String line = value.toString();
            StringTokenizer token = new StringTokenizer(line);
            while (token.hasMoreTokens()) {
                String s = token.nextToken();
                v.set(s+"\1"+random.nextInt(20));
                context.write(key, v);
            }
        }
    }



    public static void main(String[] args)  throws Exception {
        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);
        job.setJarByClass(SeqenceFileWriterMR.class);
        job.setJobName("seqencefileWriter");

        String in = "hdfs://localhost:9000/test/wordcount.txt";
        String out = "hdfs://localhost:9000/test/seqencefile2";

        job.setMapperClass(SequenceFileMaper.class);
        FileInputFormat.addInputPath(job, new Path(in));

        job.setInputFormatClass(TextInputFormat.class);

        job.setNumReduceTasks(0);

        // state the key/value classes stored in the SequenceFile explicitly
        // (they happen to match the defaults: LongWritable keys, Text values)
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputPath(job, new Path(out));

        job.waitForCompletion(true);
    }

}

Copy the data into the Hive table directory
hadoop dfs -cp /test/seqencefile2/* /user/work/warehouse/test_seqencefile/
Query the Hive table
hive> select * from test_seqencefile limit 5;                  
OK
hello   18
world   6
hello   12
presto  2
hello   14