Generating HFiles with Flink

The requirement:

    To unify the technology stack, the team agreed to handle both batch and stream processing with Flink.

The problem:

    Compared with Spark, Flink's batch tooling still feels fairly immature: things Spark knocks out in a couple of lines take some extra thought in Flink.

For reference, the Spark version:

import java.util.{ArrayList => javaList}

import org.apache.hadoop.hbase.{HBaseConfiguration, KeyValue}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}

object CreateHfile {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("CreateHfile").setMaster(args(0))
    val sc = new SparkContext(conf)

    val hbaseConf = HBaseConfiguration.create()

    // Expand each input line into 9999 suffixed row keys (suffixes 0001-9999),
    // sort them, then map each key to an (ImmutableBytesWritable, KeyValue) pair.
    val rdd = sc.textFile(args(1))
      .flatMap(v => {
        val x = new javaList[String]()
        for (a <- 1 to 9999) {
          x.add(v + "%04d".format(a))
        }
        x.toArray
      })
      .sortBy(v => v.toString)
      .map(r => (new ImmutableBytesWritable(Bytes.toBytes(r.toString)),
        // Note: KeyValue.Type.DeleteColumn produces delete markers; to write data cells,
        // build the KeyValue without the Type argument (or with Type.Put).
        new KeyValue(Bytes.toBytes(r.toString), Bytes.toBytes("phoneFamliy"), Bytes.toBytes("phoneCol"),
          System.currentTimeMillis(), KeyValue.Type.DeleteColumn)))

    // Write the sorted pairs out as HFiles.
    rdd.saveAsNewAPIHadoopFile(args(2), classOf[ImmutableBytesWritable], classOf[KeyValue],
      classOf[HFileOutputFormat2], hbaseConf)
    sc.stop()

  }

}

Now the MapReduce version:

public class HFileCreateJob {
    private final static Logger log = LoggerFactory.getLogger(HFileCreateJob.class);

    public void run(String input, String output, String env) throws Exception {

        Configuration conf = new Configuration();
        if ("dev".equals(env)) {
            devHeader(conf);
        }

        try {
            // Delete any existing output directory before running.
            try {
                FileSystem fs = FileSystem.get(URI.create(output), conf);
                fs.delete(new Path(output), true);
                fs.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
            Job job = Job.getInstance(conf, "HFileCreateJob");
            job.setJobName("Zhao@HFileCreateJob_V1.0");
            job.setJarByClass(HFileCreateJob.class);
            job.setMapperClass(HfileMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);

            job.setReducerClass(HfileReducer.class);
            job.setOutputKeyClass(ImmutableBytesWritable.class);
            job.setOutputValueClass(KeyValue.class);
            FileInputFormat.addInputPath(job, new Path(input));
            FileOutputFormat.setOutputPath(job, new Path(output));
            job.setOutputFormatClass(HFileOutputFormat2.class);
            System.exit(job.waitForCompletion(true) ? 0 : 1);

        } catch (IOException e) {
            e.printStackTrace();
        }

    }

    private void devHeader(Configuration conf) {
        // Submit from a local machine to the test cluster.
        conf.set("mapreduce.app-submission.cross-platform", "true");
        conf.set("mapreduce.job.ubertask.enable", "true");
        conf.set("fs.defaultFS", "hdfs://10.10.10.165:8020");
        conf.set("mapreduce.job.jar", "E:\\intermult-hbase\\target\\intermulthbase-1.0-SNAPSHOT.jar");
        // Recurse into subdirectories under the input path.
        conf.set("mapreduce.input.fileinputformat.input.dir.recursive", "true");
        System.setProperty("hadoop.home.dir", "D:\\soft\\developsoft\\Hadoop\\hadoop-2.6.5");
        System.setProperty("HADOOP_USER_NAME", "hdfs");
    }
}

public class HfileMapper extends Mapper<LongWritable, Text, Text, Text> {
    private String rowKeySalt = ConfigFactory.load().getConfig("hfileCreate").getString("rowKeySalt");

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Input fields are separated by \001; the row key is SHA-256(firstField + salt),
        // and the value keeps the whole line with \001 replaced by the "!@#$" delimiter.
        String[] datas = value.toString().split("\\001");
        String content = value.toString().replaceAll("\\001", "\\!\\@\\#\\$");
        Text rowKey = new Text(SHA256Util.getSHA256Str(datas[0] + rowKeySalt));
        context.write(rowKey, new Text(content));
    }
}

public class HfileReducer extends Reducer<Text, Text, ImmutableBytesWritable, KeyValue> {
    private final static Logger logger = LoggerFactory.getLogger(HfileReducer.class);
    private Config env = ConfigFactory.load().getConfig("hfileCreate");
    private String family = env.getString("family");
    private String column = env.getString("column");

    @Override
    protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, ImmutableBytesWritable, KeyValue>.Context context)
            throws IOException, InterruptedException {

        for (Text value : values) {
            try {
                String line = value.toString();
                logger.error("line : " + line);
                // Emit one KeyValue per input line, keyed by the mapper's SHA-256 row key.
                ImmutableBytesWritable rowkey = new ImmutableBytesWritable(key.toString().getBytes());
                KeyValue kv = new KeyValue(key.toString().getBytes(), this.family.getBytes(), column.getBytes(), line.getBytes());
                context.write(rowkey, kv);

            } catch (Exception e) {
                logger.error("", e);
            }
        }
    }
}
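One note on the job setup above: it wires HFileOutputFormat2 by hand, so the output is not partitioned by HBase region and no table-specific compression or block-encoding settings are picked up. The usual alternative is to let HBase configure the job via HFileOutputFormat2.configureIncrementalLoad. A minimal sketch, assuming an HBase 1.x client and a hypothetical table name ("test_table"); be aware that configureIncrementalLoad also picks a sorting reducer based on the map output value class, so it takes some care to combine with a custom reducer like the one above:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.mapreduce.Job;

public class ConfigureHfileJob {

    public static void configure(Job job) throws Exception {
        Configuration conf = HBaseConfiguration.create(job.getConfiguration());
        TableName tableName = TableName.valueOf("test_table"); // hypothetical table name

        try (Connection conn = ConnectionFactory.createConnection(conf);
             Table table = conn.getTable(tableName);
             RegionLocator locator = conn.getRegionLocator(tableName)) {
            // Sets the output format, a TotalOrderPartitioner over region boundaries,
            // one reducer per region, and the table's compression/bloom/encoding settings.
            HFileOutputFormat2.configureIncrementalLoad(job, table, locator);
        }
    }
}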

Finally, the Flink solution:

import org.apache.flink.api.common.functions.RichGroupReduceFunction;
import org.apache.flink.api.common.operators.Order;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.hadoop.mapreduce.HadoopInputFormat;
import org.apache.flink.api.java.hadoop.mapreduce.HadoopOutputFormat;
import org.apache.flink.api.java.tuple.Tuple1;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

/**
 * Generate HFiles with Flink.
 * https://ci.apache.org/projects/flink/flink-docs-release-1.7/dev/batch/hadoop_compatibility.html
 * Created by geo on 2019/4/8. */
public class Application {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        devHeader(conf);
        Job job = Job.getInstance(conf);

        // HDFS input
        HadoopInputFormat<LongWritable, Text> hadoopIF =
                new HadoopInputFormat<LongWritable, Text>(
                        new TextInputFormat(), LongWritable.class, Text.class, job
                );
        TextInputFormat.addInputPath(job, new Path("hdfs://2.2.2.2:8020/user/zhao/out0226/testHfile"));


        // This is all Flink actually does: read, group, sort, and emit (rowkey, KeyValue) pairs.
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        DataSet<Tuple2<LongWritable, Text>> textDataSet = env.createInput(hadoopIF);
        DataSet<Tuple2<ImmutableBytesWritable, Cell>> ds = textDataSet.map(v -> Tuple1.of(v.f1.toString()))
                .returns(Types.TUPLE(Types.STRING))
                .groupBy(0)
                .sortGroup(0, Order.ASCENDING)
                .reduceGroup(new createHfile());

        // Set the output types.
        job.setOutputKeyClass(ImmutableBytesWritable.class);
        job.setOutputValueClass(KeyValue.class);

        // Write to HDFS.
        HadoopOutputFormat<ImmutableBytesWritable, Cell> hadoopOF =
                new HadoopOutputFormat<ImmutableBytesWritable, Cell>(
                        new HFileOutputFormat2(), job
                );
        HFileOutputFormat2.setOutputPath(job, new Path("hdfs://10.111.32.165:8020/user/zhao/out0226/9/"));
        job.setOutputFormatClass(HFileOutputFormat2.class);
        ds.output(hadoopOF);
        env.execute();
    }

    // Produce Tuple2<ImmutableBytesWritable, Cell>
    public static final class createHfile extends RichGroupReduceFunction<Tuple1<String>, Tuple2<ImmutableBytesWritable, Cell>> {

        @Override
        public void reduce(Iterable<Tuple1<String>> values, Collector<Tuple2<ImmutableBytesWritable, Cell>> out) throws Exception {
            String family = "datasfamily";
            String column = "content";
            for (Tuple1<String> key : values) {
                // Use the tuple field (key.f0) as the row key; key.toString() would
                // include the tuple's surrounding parentheses in the key bytes.
                ImmutableBytesWritable rowkey = new ImmutableBytesWritable(key.f0.getBytes());
                KeyValue kv = new KeyValue(key.f0.getBytes(), family.getBytes(), column.getBytes(), key.f0.getBytes());
                out.collect(Tuple2.of(rowkey, kv));
            }
        }
    }

    /**
     * For local or test environments.
     * @param conf Configuration
     */
    private static void devHeader(Configuration conf) {
        // Submit from a local machine to the test cluster.
        conf.set("mapreduce.app-submission.cross-platform", "true");
        conf.set("mapreduce.job.ubertask.enable", "true");
        conf.set("fs.defaultFS", "hdfs://2.2.2.2:8020");
        // Recurse into subdirectories under the input path.
        conf.set("mapreduce.input.fileinputformat.input.dir.recursive", "true");
        System.setProperty("hadoop.home.dir", "D:\\soft\\developsoft\\Hadoop\\hadoop-2.6.5");
        System.setProperty("HADOOP_USER_NAME", "hdfs");
    }
}
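The HFiles written to HDFS still have to be bulk-loaded into HBase before the data becomes queryable. A minimal sketch using LoadIncrementalHFiles, assuming an HBase 1.x client and a hypothetical table name ("test_table") whose column family matches the one used above; the same class can also be invoked from the hbase command-line launcher:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;

public class BulkLoadHfiles {

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        // Hypothetical table; it must already exist and contain the column family
        // the HFiles were generated with (e.g. "datasfamily").
        TableName tableName = TableName.valueOf("test_table");

        try (Connection conn = ConnectionFactory.createConnection(conf);
             Admin admin = conn.getAdmin();
             Table table = conn.getTable(tableName);
             RegionLocator locator = conn.getRegionLocator(tableName)) {

            LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
            // Directory produced by the Flink job above.
            loader.doBulkLoad(new Path("hdfs://10.111.32.165:8020/user/zhao/out0226/9/"), admin, table, locator);
        }
    }
}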

Here is the pom as well:

 <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <logback.version>1.1.5</logback.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka_2.11</artifactId>
            <version>1.0.1</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
                <exclusion>
                    <artifactId>jmxri</artifactId>
                    <groupId>com.sun.jmx</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>jmxtools</artifactId>
                    <groupId>com.sun.jdmk</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>jms</artifactId>
                    <groupId>javax.jms</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>junit</artifactId>
                    <groupId>junit</groupId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>com.typesafe</groupId>
            <artifactId>config</artifactId>
            <version>1.2.1</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>
        <!--******** logback and slf4j ******** -->
        <dependency>
            <groupId>ch.qos.logback</groupId>
            <artifactId>logback-core</artifactId>
            <version>${logback.version}</version>
        </dependency>
        <dependency>
            <groupId>ch.qos.logback</groupId>
            <artifactId>logback-classic</artifactId>
            <version>${logback.version}</version>
        </dependency>

        <dependency>
            <groupId>ch.qos.logback</groupId>
            <artifactId>logback-access</artifactId>
            <version>${logback.version}</version>
        </dependency>
        <dependency>
            <groupId>commons-codec</groupId>
            <artifactId>commons-codec</artifactId>
            <version>RELEASE</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.4.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <transformers>
                                <transformer
                                        implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>com.geotmt.dw.Application</mainClass>
                                </transformer>
                            </transformers>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
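The excerpt above only covers Kafka, Typesafe Config, and logging; the Flink, Hadoop, and HBase dependencies the code relies on are not shown. A rough sketch of what would need to be added, assuming Flink 1.7.x (to match the docs link above), Hadoop 2.6.x, and an HBase 1.x client; the exact versions and Scala suffix must match your cluster:

<!-- Assumed versions; adjust to your cluster. -->
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-java</artifactId>
    <version>1.7.2</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-clients_2.11</artifactId>
    <version>1.7.2</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-hadoop-compatibility_2.11</artifactId>
    <version>1.7.2</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.6.5</version>
</dependency>
<dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-server</artifactId>
    <version>1.2.6</version>
</dependency>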

Closing thoughts:

    Flink is still a fairly young technology; let's watch it grow up.
