Using MapReduce to read HBase data and write it to HDFS
The Java code is as follows:
import com.google.common.collect.Lists;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Logger;
public class HbaseToHdfs
{
    private static final Logger logger = Logger.getLogger(HbaseToHdfs.class);
    public static Configuration conf = HBaseConfiguration.create();

    static {
        // HBase master address (host:port)
        conf.set("hbase.master", "xxx:xxx");
        conf.set("mapreduce.output.fileoutputformat.compress", "false"); // disable output compression
        // location of the Hadoop configuration files
        conf.addResource(new Path("xxx/core-site.xml"));
        conf.addResource(new Path("xxx/hdfs-site.xml"));
        // location of the HBase configuration file
        conf.addResource(new Path("xxx/hbase-site.xml"));
        // HBase client retry/timeout tuning
        conf.set("hbase.client.pause", "2000");
        conf.set("hbase.client.retries.number", "100");
        conf.set("hbase.client.operation.timeout", "500000");
    }

    public static void main(String[] args)
        throws Exception
    {
        // args[2] is a properties file mapping each HBase table name to its column list
        InputStream foin = new FileInputStream(args[2]);
        Properties prop = new Properties();
        prop.load(foin);
        foin.close();
        String columns = prop.getProperty(args[0]).trim();
        conf.set("columns", columns);

        // "xxx" is a job name of your choice
        Job job = Job.getInstance(conf, "xxx");
        job.setJarByClass(HbaseToHdfs.class);
        job.setNumReduceTasks(0); // map-only job
        TableMapReduceUtil.initTableMapperJob(initScans(job, args[0]), MyMapper.class,
            NullWritable.class, Text.class, job);

        // delete the output path (recursively) if it already exists
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(new Path(args[1]))) {
            fs.delete(new Path(args[1]), true);
        }
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        long start = System.currentTimeMillis();
        try
        {
            job.waitForCompletion(true);
        }
        finally
        {
            // open up permissions on the output directory and its files
            fs.setPermission(new Path(args[1]), new FsPermission("777"));
            FileStatus[] files = fs.listStatus(new Path(args[1]));
            for (FileStatus fileStatus : files)
            {
                fs.setPermission(fileStatus.getPath(), new FsPermission("777"));
            }
            fs.close();
            long end = System.currentTimeMillis();
            logger.info("Job<" + job.getJobName() + "> succeeded: " + job.isSuccessful()
                + "; start: " + start + "; end: " + end + "; elapsed: " + (end - start) + "ms");
        }
    }

    private static List<Scan> initScans(Job job, String tableName)
    {
        Configuration conf = job.getConfiguration();
        Scan scan = new Scan();
        // tell the multi-table input format which table this scan targets
        // (the attribute key is the value of Scan.SCAN_ATTRIBUTES_TABLE_NAME)
        scan.setAttribute("scan.attributes.table.name", Bytes.toBytes(tableName));
        return Lists.newArrayList(scan);
    }

    public static class MyMapper
        extends TableMapper<NullWritable, Text>
    {
        String columns = "";

        protected void map(ImmutableBytesWritable key, Result r, Context context)
            throws IOException, InterruptedException
        {
            if (r != null)
            {
                String all = "";
                int j = 0;
                for (String column : this.columns.split(","))
                {
                    j++;
                    String s = "";
                    try
                    {
                        // "xxx" is the column family name
                        byte[] p = r.getValue("xxx".getBytes(), column.getBytes());
                        if (p != null)
                        {
                            // decode as UTF-8 and strip characters that clash with
                            // the field separator; customize as needed
                            s = new String(p, "UTF-8");
                            s = s.replaceAll("\\n", "").replaceAll("\\r", "");
                            s = s.replaceAll(",", ".");
                            s = s.replaceAll(";", ".");
                            if ("NULL".equals(s)) {
                                s = "";
                            }
                        }
                    }
                    catch (Exception e)
                    {
                        logger.error("failed to read column " + column, e);
                        s = "";
                    }
                    if (j == 1) {
                        all = s;
                    } else {
                        // the field separator written to HDFS is a comma
                        all = all + "," + s;
                    }
                }
                context.write(NullWritable.get(), new Text(all));
            }
        }

        protected void setup(Context context)
            throws IOException, InterruptedException
        {
            Configuration conf = context.getConfiguration();
            this.columns = conf.get("columns");
        }
    }
}
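Since the job only ever scans one table, the List<Scan> overload together with the initScans helper is not strictly required. A minimal alternative sketch using the single-table overload of TableMapReduceUtil (the caching settings here are illustrative assumptions, not part of the original code):

Scan scan = new Scan();
scan.setCaching(500);        // rows fetched per RPC; tune for scan-heavy jobs
scan.setCacheBlocks(false);  // block caching is usually disabled for MapReduce scans
TableMapReduceUtil.initTableMapperJob(args[0], scan,
    MyMapper.class, NullWritable.class, Text.class, job);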
Usage:
Three arguments are passed in:
the first is the HBase table name
the second is the HDFS output path
the third is the path to a properties file
whose entries have the format:
hbase table name=comma-separated list of the columns to export (the fields that will later be loaded into Hive)
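For example, for a hypothetical HBase table user_info from which the columns name, age and city should be exported (all names here are illustrative), the properties file would contain a line such as:

user_info=name,age,city

and the job could then be launched along these lines (jar name and paths are placeholders):

hadoop jar hbase-to-hdfs.jar HbaseToHdfs user_info /output/user_info /path/to/columns.properties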
The Maven dependencies are as follows:
<dependencies>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-hadoop</artifactId>
<version>1.10.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.6.0</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.6.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>2.6.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-client -->
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>1.0.0-cdh5.5.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-common -->
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-common</artifactId>
<version>1.0.0-cdh5.5.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase -->
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase</artifactId>
<version>1.0.0-cdh5.5.0</version>
<type>pom</type>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-protocol</artifactId>
<version>1.0.0-cdh5.5.0</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>1.0.0-cdh5.5.0</version>
</dependency>
</dependencies>
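Note that the cdh5.5.0 artifact versions above are not published to Maven Central; they come from Cloudera's repository, so the pom presumably also needs a repository entry along these lines:

<repositories>
  <repository>
    <id>cloudera</id>
    <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
  </repository>
</repositories>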