Reading and Writing parquet-thrift Data, and Querying It from a Hive Table


Blog: http://www.fanlegefan.com
Article: http://www.fanlegefan.com/archives/parquet-thrfit-hive/


Abstract

This post uses MapReduce to read and write parquet-thrift format files, then loads the data into a Hive table for querying.

Define the person.thrift file

namespace java com.fan.hadoop.parquet.thrift

struct Person{
    1:optional string name;
    2:optional i32 age;
}
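Both fields are declared optional; parquet-thrift maps them to optional (nullable) columns in the resulting parquet schema.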
Generate the Java bean:
thrift --gen java person.thrift
The generated files are laid out as follows:
├── gen-java
│   └── com
│       └── fan
│           └── hadoop
│               └── parquet
│                   └── thrift
│                       └── Person.java
└── person.thrift

Copy Person.java into your project.
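The project also needs the parquet-thrift and libthrift dependencies on its classpath, and the libthrift version should match the thrift compiler used above (the exact versions depend on your build, which this post does not spell out).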

Writing parquet-thrift format files with MapReduce

package com.fan.hadoop.parquet.thrift;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.thrift.ParquetThriftOutputFormat;

import java.io.IOException;

/**
 * Created by http://www.fanlegefan.com on 17-7-21.
 */
public class ParquetThriftWriterMR {

    public static class ParthriftMap extends
            Mapper<LongWritable, Text, Void, Person> {

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Each input line is "name,age"; turn it into a thrift Person.
            String[] arr = value.toString().split(",");
            Person p = new Person();
            p.setName(arr[0]);
            p.setAge(Integer.parseInt(arr[1].trim()));
            // The key is unused: ParquetThriftOutputFormat only looks at the value.
            context.write(null, p);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);
        job.setJarByClass(ParquetThriftWriterMR.class);
        job.setJobName("parquetthrift");

        String in = "hdfs://localhost:9000/test/user.txt";
        String out = "hdfs://localhost:9000/test/wdthrfit";

        job.setMapperClass(ParthriftMap.class);
        TextInputFormat.addInputPath(job, new Path(in));
        job.setInputFormatClass(TextInputFormat.class);
        // Map-only job: the mappers write the parquet files directly.
        job.setNumReduceTasks(0);

        job.setOutputFormatClass(ParquetThriftOutputFormat.class);
        ParquetThriftOutputFormat.setCompression(job, CompressionCodecName.GZIP);
        ParquetThriftOutputFormat.setOutputPath(job, new Path(out));
        // The thrift class supplies the parquet schema.
        ParquetThriftOutputFormat.setThriftClass(job, Person.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
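The input user.txt is plain CSV with one name,age pair per line. The original run does not show it, but judging from the query results later in this post it contained something like:

leo,20
jack,10
jim,30
andy,30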
After the job finishes, the generated files look like this:
hadoop dfs -ls /test/wdthrfit

Found 4 items
-rw-r--r--   3 work supergroup          0 2017-07-21 11:51 /test/wdthrfit/_SUCCESS
-rw-r--r--   3 work supergroup        529 2017-07-21 11:51 /test/wdthrfit/_common_metadata
-rw-r--r--   3 work supergroup        680 2017-07-21 11:51 /test/wdthrfit/_metadata
-rw-r--r--   3 work supergroup        776 2017-07-21 11:51 /test/wdthrfit/part-m-00000.gz.parquet
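Besides the _SUCCESS marker, Parquet writes _metadata and _common_metadata summary files next to the data file. For small experiments you can also skip MapReduce and produce the same kind of file with the standalone ThriftParquetWriter from the parquet-thrift module (a minimal sketch reusing the Person class; the output path is made up for illustration):

package com.fan.hadoop.parquet.thrift;

import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.thrift.ThriftParquetWriter;

public class ParquetThriftLocalWriter {

    public static void main(String[] args) throws Exception {
        // Hypothetical output path, for illustration only.
        Path path = new Path("hdfs://localhost:9000/test/person-local.parquet");

        ThriftParquetWriter<Person> writer =
                new ThriftParquetWriter<Person>(path, Person.class, CompressionCodecName.GZIP);

        Person p = new Person();
        p.setName("leo");
        p.setAge(20);
        writer.write(p);

        // close() flushes the row groups and writes the parquet footer.
        writer.close();
    }
}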

Reading parquet-thrift format files

package com.fan.hadoop.parquet.thrift;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.parquet.hadoop.thrift.ParquetThriftInputFormat;
import java.io.IOException;

/**
 * Created by http://www.fanlegefan.com on 17-7-21.
 */
public class ParquetThriftReaderMR {

    public static class ParthriftMap extends
            Mapper<Void, Person, LongWritable, Text> {
        private Text p = new Text();

        @Override
        public void map(Void key, Person value, Context context)
                throws IOException, InterruptedException {
            // Flatten the thrift Person back into a "name,age" CSV line.
            p.set(value.getName() + "," + value.getAge());
            // With a null key, TextOutputFormat emits only the value.
            context.write(null, p);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);
        job.setJarByClass(ParquetThriftReaderMR.class);
        job.setJobName("parquetthriftread");

        String in = "hdfs://localhost:9000/test/wdthrfit";
        String out = "hdfs://localhost:9000/test/wdthriftread";

        job.setMapperClass(ParthriftMap.class);

        // ParquetThriftInputFormat recovers the thrift class from the file metadata.
        job.setInputFormatClass(ParquetThriftInputFormat.class);
        ParquetThriftInputFormat.setInputPaths(job, new Path(in));
        job.setNumReduceTasks(0);

        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path(out));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
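Like the writer, the reader is launched with hadoop jar, for example (the jar name here is hypothetical):

hadoop jar parquet-thrift-demo.jar com.fan.hadoop.parquet.thrift.ParquetThriftReaderMR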
Check the contents of the generated text file:
hadoop dfs -cat /test/wdthriftread/part-m-00000

leo,20
jack,10
jim,30
andy,30

Reading parquet-thrift format files from a Hive table

Create a parquet-format table:
create table test_parquet(name string, age int) stored as parquet;

Inspect the table definition:

hive> show create table test_parquet;
OK
CREATE TABLE `test_parquet`(
  `name` string, 
  `age` int)
ROW FORMAT SERDE 
  'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
  'hdfs://localhost:9000/user/hive/warehouse/test_parquet'
TBLPROPERTIES (
  'transient_lastDdlTime'='1495038003')
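ParquetHiveSerDe resolves parquet columns against the table schema by name, so the name and age fields written by the MR job line up with the table columns without further configuration.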

Copy the files generated earlier into the Hive table's data directory:

hadoop dfs -cp /test/wdthrfit/*  /user/hive/warehouse/test_parquet/
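The _SUCCESS, _metadata and _common_metadata files come along with the copy, but that is harmless: Hadoop's input formats treat files whose names start with an underscore as hidden, so only part-m-00000.gz.parquet is actually read.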

Query the data from Hive:

hive> select name,age from test_parquet;
OK
leo 20
jack    10
jim 30
andy    30
Time taken: 0.049 seconds, Fetched: 4 row(s)
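Copying into the warehouse directory works, but you could also leave the files where the MR job wrote them and point an external table at that directory instead (a sketch; the table name test_parquet_ext is made up):

create external table test_parquet_ext(name string, age int)
stored as parquet
location 'hdfs://localhost:9000/test/wdthrfit';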