Reading and Writing parquet-thrift Data, and Querying It from a Hive Table


Blog: http://www.fanlegefan.com
Article: http://www.fanlegefan.com/archives/parquet-thrfit-hive/


Abstract

This post uses MapReduce to read and write parquet-thrift format files, then loads the data into a Hive table for querying.

Define the person.thrift file

namespace java com.fan.hadoop.parquet.thrift

struct Person{
    1:optional string name;
    2:optional i32 age;
}
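Both fields are declared optional; parquet-thrift maps them to optional (nullable) columns in the resulting parquet schema.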
Generate the Java bean:
thrift --gen java person.thrift
The generated files are laid out as follows:
├── gen-java
│   └── com
│       └── fan
│           └── hadoop
│               └── parquet
│                   └── thrift
│                       └── Person.java
└── person.thrift

Copy Person.java into your project.
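The project also needs the parquet-thrift and libthrift dependencies on its classpath, and the libthrift version should match the thrift compiler used above (the exact versions depend on your build, which this post does not spell out).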

Writing parquet-thrift format files with MapReduce

package com.fan.hadoop.parquet.thrift;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.thrift.ParquetThriftOutputFormat;

import java.io.IOException;

/**
 * Created by http://www.fanlegefan.com on 17-7-21.
 */
public class ParquetThriftWriterMR {

    public static class ParthriftMap extends
            Mapper<LongWritable, Text, Void, Person> {

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Each input line is "name,age"; turn it into a thrift Person.
            String[] arr = value.toString().split(",");
            Person p = new Person();
            p.setName(arr[0]);
            p.setAge(Integer.parseInt(arr[1].trim()));
            // The key is unused: ParquetThriftOutputFormat only looks at the value.
            context.write(null, p);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);
        job.setJarByClass(ParquetThriftWriterMR.class);
        job.setJobName("parquetthrift");

        String in = "hdfs://localhost:9000/test/user.txt";
        String out = "hdfs://localhost:9000/test/wdthrfit";

        job.setMapperClass(ParthriftMap.class);
        TextInputFormat.addInputPath(job, new Path(in));
        job.setInputFormatClass(TextInputFormat.class);
        // Map-only job: the mappers write the parquet files directly.
        job.setNumReduceTasks(0);

        job.setOutputFormatClass(ParquetThriftOutputFormat.class);
        ParquetThriftOutputFormat.setCompression(job, CompressionCodecName.GZIP);
        ParquetThriftOutputFormat.setOutputPath(job, new Path(out));
        // The thrift class supplies the parquet schema.
        ParquetThriftOutputFormat.setThriftClass(job, Person.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
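The input user.txt is plain CSV with one name,age pair per line. The original run does not show it, but judging from the query results later in this post it contained something like:

leo,20
jack,10
jim,30
andy,30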
After the job finishes, the generated files look like this:
hadoop dfs -ls /test/wdthrfit

Found 4 items
-rw-r--r--   3 work supergroup          0 2017-07-21 11:51 /test/wdthrfit/_SUCCESS
-rw-r--r--   3 work supergroup        529 2017-07-21 11:51 /test/wdthrfit/_common_metadata
-rw-r--r--   3 work supergroup        680 2017-07-21 11:51 /test/wdthrfit/_metadata
-rw-r--r--   3 work supergroup        776 2017-07-21 11:51 /test/wdthrfit/part-m-00000.gz.parquet
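Besides the _SUCCESS marker, Parquet writes _metadata and _common_metadata summary files next to the data file. For small experiments you can also skip MapReduce and produce the same kind of file with the standalone ThriftParquetWriter from the parquet-thrift module (a minimal sketch reusing the Person class; the output path is made up for illustration):

package com.fan.hadoop.parquet.thrift;

import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.thrift.ThriftParquetWriter;

public class ParquetThriftLocalWriter {

    public static void main(String[] args) throws Exception {
        // Hypothetical output path, for illustration only.
        Path path = new Path("hdfs://localhost:9000/test/person-local.parquet");

        ThriftParquetWriter<Person> writer =
                new ThriftParquetWriter<Person>(path, Person.class, CompressionCodecName.GZIP);

        Person p = new Person();
        p.setName("leo");
        p.setAge(20);
        writer.write(p);

        // close() flushes the row groups and writes the parquet footer.
        writer.close();
    }
}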

Reading parquet-thrift format files

package com.fan.hadoop.parquet.thrift;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.parquet.hadoop.thrift.ParquetThriftInputFormat;
import java.io.IOException;

/**
 * Created by http://www.fanlegefan.com on 17-7-21.
 */
public class ParquetThriftReaderMR {

    public static class ParthriftMap extends
            Mapper<Void, Person, LongWritable, Text> {
        private Text p = new Text();

        @Override
        public void map(Void key, Person value, Context context)
                throws IOException, InterruptedException {
            // Flatten the thrift Person back into a "name,age" CSV line.
            p.set(value.getName() + "," + value.getAge());
            // With a null key, TextOutputFormat emits only the value.
            context.write(null, p);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);
        job.setJarByClass(ParquetThriftReaderMR.class);
        job.setJobName("parquetthriftread");

        String in = "hdfs://localhost:9000/test/wdthrfit";
        String out = "hdfs://localhost:9000/test/wdthriftread";

        job.setMapperClass(ParthriftMap.class);

        // ParquetThriftInputFormat recovers the thrift class from the file metadata.
        job.setInputFormatClass(ParquetThriftInputFormat.class);
        ParquetThriftInputFormat.setInputPaths(job, new Path(in));
        job.setNumReduceTasks(0);

        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path(out));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
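Like the writer, the reader is launched with hadoop jar, for example (the jar name here is hypothetical):

hadoop jar parquet-thrift-demo.jar com.fan.hadoop.parquet.thrift.ParquetThriftReaderMR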
Check the contents of the generated text file:
hadoop dfs -cat /test/wdthriftread/part-m-00000

leo,20
jack,10
jim,30
andy,30

Reading parquet-thrift format files from a Hive table

Create a parquet-format table:
create table test_parquet(name string, age int) stored as parquet;

Inspect the table definition:

hive> show create table test_parquet;
OK
CREATE TABLE `test_parquet`(
  `name` string, 
  `age` int)
ROW FORMAT SERDE 
  'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
  'hdfs://localhost:9000/user/hive/warehouse/test_parquet'
TBLPROPERTIES (
  'transient_lastDdlTime'='1495038003')
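ParquetHiveSerDe resolves parquet columns against the table schema by name, so the name and age fields written by the MR job line up with the table columns without further configuration.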

Copy the files generated earlier into the Hive table's data directory:

hadoop dfs -cp /test/wdthrfit/*  /user/hive/warehouse/test_parquet/
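The _SUCCESS, _metadata and _common_metadata files come along with the copy, but that is harmless: Hadoop's input formats treat files whose names start with an underscore as hidden, so only part-m-00000.gz.parquet is actually read.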

Query the data from Hive:

hive> select name,age from test_parquet;
OK
leo 20
jack    10
jim 30
andy    30
Time taken: 0.049 seconds, Fetched: 4 row(s)
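Copying into the warehouse directory works, but you could also leave the files where the MR job wrote them and point an external table at that directory instead (a sketch; the table name test_parquet_ext is made up):

create external table test_parquet_ext(name string, age int)
stored as parquet
location 'hdfs://localhost:9000/test/wdthrfit';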