Generating HFiles with Flink

The requirement:

    To unify the technology stack, the team agreed to handle both batch and stream processing with Flink.

The problem:

    Compared with Spark, Flink's batch tooling still feels fairly immature: things Spark knocks out in a couple of lines take some extra thought in Flink.

For reference, the Spark version:

import java.util.{ArrayList => javaList}

import org.apache.hadoop.hbase.{HBaseConfiguration, KeyValue}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}

object CreateHfile {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("CreateHfile").setMaster(args(0))
    val sc = new SparkContext(conf)

    val hbaseConf = HBaseConfiguration.create()

    // Expand each input line into 9999 suffixed row keys (suffixes 0001-9999),
    // sort them, then map each key to an (ImmutableBytesWritable, KeyValue) pair.
    val rdd = sc.textFile(args(1))
      .flatMap(v => {
        val x = new javaList[String]()
        for (a <- 1 to 9999) {
          x.add(v + "%04d".format(a))
        }
        x.toArray
      })
      .sortBy(v => v.toString)
      .map(r => (new ImmutableBytesWritable(Bytes.toBytes(r.toString)),
        // Note: KeyValue.Type.DeleteColumn produces delete markers; to write data cells,
        // build the KeyValue without the Type argument (or with Type.Put).
        new KeyValue(Bytes.toBytes(r.toString), Bytes.toBytes("phoneFamliy"), Bytes.toBytes("phoneCol"),
          System.currentTimeMillis(), KeyValue.Type.DeleteColumn)))

    // Write the sorted pairs out as HFiles.
    rdd.saveAsNewAPIHadoopFile(args(2), classOf[ImmutableBytesWritable], classOf[KeyValue],
      classOf[HFileOutputFormat2], hbaseConf)
    sc.stop()

  }

}

Now the MapReduce version:

public class HFileCreateJob {
    private final static Logger log = LoggerFactory.getLogger(HFileCreateJob.class);

    public void run(String input, String output, String env) throws Exception {

        Configuration conf = new Configuration();
        if ("dev".equals(env)) {
            devHeader(conf);
        }

        try {
            // Delete any existing output directory before running.
            try {
                FileSystem fs = FileSystem.get(URI.create(output), conf);
                fs.delete(new Path(output), true);
                fs.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
            Job job = Job.getInstance(conf, "HFileCreateJob");
            job.setJobName("Zhao@HFileCreateJob_V1.0");
            job.setJarByClass(HFileCreateJob.class);
            job.setMapperClass(HfileMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);

            job.setReducerClass(HfileReducer.class);
            job.setOutputKeyClass(ImmutableBytesWritable.class);
            job.setOutputValueClass(KeyValue.class);
            FileInputFormat.addInputPath(job, new Path(input));
            FileOutputFormat.setOutputPath(job, new Path(output));
            job.setOutputFormatClass(HFileOutputFormat2.class);
            System.exit(job.waitForCompletion(true) ? 0 : 1);

        } catch (IOException e) {
            e.printStackTrace();
        }

    }

    private void devHeader(Configuration conf) {
        // Submit from a local machine to the test cluster.
        conf.set("mapreduce.app-submission.cross-platform", "true");
        conf.set("mapreduce.job.ubertask.enable", "true");
        conf.set("fs.defaultFS", "hdfs://10.10.10.165:8020");
        conf.set("mapreduce.job.jar", "E:\\intermult-hbase\\target\\intermulthbase-1.0-SNAPSHOT.jar");
        // Recurse into subdirectories under the input path.
        conf.set("mapreduce.input.fileinputformat.input.dir.recursive", "true");
        System.setProperty("hadoop.home.dir", "D:\\soft\\developsoft\\Hadoop\\hadoop-2.6.5");
        System.setProperty("HADOOP_USER_NAME", "hdfs");
    }
}

public class HfileMapper extends Mapper<LongWritable, Text, Text, Text> {
    private String rowKeySalt = ConfigFactory.load().getConfig("hfileCreate").getString("rowKeySalt");

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Input fields are separated by \001; the row key is SHA-256(firstField + salt),
        // and the value keeps the whole line with \001 replaced by the "!@#$" delimiter.
        String[] datas = value.toString().split("\\001");
        String content = value.toString().replaceAll("\\001", "\\!\\@\\#\\$");
        Text rowKey = new Text(SHA256Util.getSHA256Str(datas[0] + rowKeySalt));
        context.write(rowKey, new Text(content));
    }
}

public class HfileReducer extends Reducer<Text, Text, ImmutableBytesWritable, KeyValue> {
    private final static Logger logger = LoggerFactory.getLogger(HfileReducer.class);
    private Config env = ConfigFactory.load().getConfig("hfileCreate");
    private String family = env.getString("family");
    private String column = env.getString("column");

    @Override
    protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, ImmutableBytesWritable, KeyValue>.Context context)
            throws IOException, InterruptedException {

        for (Text value : values) {
            try {
                String line = value.toString();
                logger.error("line : " + line);
                // Emit one KeyValue per input line, keyed by the mapper's SHA-256 row key.
                ImmutableBytesWritable rowkey = new ImmutableBytesWritable(key.toString().getBytes());
                KeyValue kv = new KeyValue(key.toString().getBytes(), this.family.getBytes(), column.getBytes(), line.getBytes());
                context.write(rowkey, kv);

            } catch (Exception e) {
                logger.error("", e);
            }
        }
    }
}
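One note on the job setup above: it wires HFileOutputFormat2 by hand, so the output is not partitioned by HBase region and no table-specific compression or block-encoding settings are picked up. The usual alternative is to let HBase configure the job via HFileOutputFormat2.configureIncrementalLoad. A minimal sketch, assuming an HBase 1.x client and a hypothetical table name ("test_table"); be aware that configureIncrementalLoad also picks a sorting reducer based on the map output value class, so it takes some care to combine with a custom reducer like the one above:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.mapreduce.Job;

public class ConfigureHfileJob {

    public static void configure(Job job) throws Exception {
        Configuration conf = HBaseConfiguration.create(job.getConfiguration());
        TableName tableName = TableName.valueOf("test_table"); // hypothetical table name

        try (Connection conn = ConnectionFactory.createConnection(conf);
             Table table = conn.getTable(tableName);
             RegionLocator locator = conn.getRegionLocator(tableName)) {
            // Sets the output format, a TotalOrderPartitioner over region boundaries,
            // one reducer per region, and the table's compression/bloom/encoding settings.
            HFileOutputFormat2.configureIncrementalLoad(job, table, locator);
        }
    }
}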

Finally, the Flink solution:

import org.apache.flink.api.common.functions.RichGroupReduceFunction;
import org.apache.flink.api.common.operators.Order;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.hadoop.mapreduce.HadoopInputFormat;
import org.apache.flink.api.java.hadoop.mapreduce.HadoopOutputFormat;
import org.apache.flink.api.java.tuple.Tuple1;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

/**
 * Generate HFiles with Flink.
 * https://ci.apache.org/projects/flink/flink-docs-release-1.7/dev/batch/hadoop_compatibility.html
 * Created by geo on 2019/4/8. */
public class Application {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        devHeader(conf);
        Job job = Job.getInstance(conf);

        // HDFS input
        HadoopInputFormat<LongWritable, Text> hadoopIF =
                new HadoopInputFormat<LongWritable, Text>(
                        new TextInputFormat(), LongWritable.class, Text.class, job
                );
        TextInputFormat.addInputPath(job, new Path("hdfs://2.2.2.2:8020/user/zhao/out0226/testHfile"));


        // This is all Flink actually does: read, group, sort, and emit (rowkey, KeyValue) pairs.
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        DataSet<Tuple2<LongWritable, Text>> textDataSet = env.createInput(hadoopIF);
        DataSet<Tuple2<ImmutableBytesWritable, Cell>> ds = textDataSet.map(v -> Tuple1.of(v.f1.toString()))
                .returns(Types.TUPLE(Types.STRING))
                .groupBy(0)
                .sortGroup(0, Order.ASCENDING)
                .reduceGroup(new createHfile());

        // Set the output types.
        job.setOutputKeyClass(ImmutableBytesWritable.class);
        job.setOutputValueClass(KeyValue.class);

        // Write to HDFS.
        HadoopOutputFormat<ImmutableBytesWritable, Cell> hadoopOF =
                new HadoopOutputFormat<ImmutableBytesWritable, Cell>(
                        new HFileOutputFormat2(), job
                );
        HFileOutputFormat2.setOutputPath(job, new Path("hdfs://10.111.32.165:8020/user/zhao/out0226/9/"));
        job.setOutputFormatClass(HFileOutputFormat2.class);
        ds.output(hadoopOF);
        env.execute();
    }

    // Produce Tuple2<ImmutableBytesWritable, Cell>
    public static final class createHfile extends RichGroupReduceFunction<Tuple1<String>, Tuple2<ImmutableBytesWritable, Cell>> {

        @Override
        public void reduce(Iterable<Tuple1<String>> values, Collector<Tuple2<ImmutableBytesWritable, Cell>> out) throws Exception {
            String family = "datasfamily";
            String column = "content";
            for (Tuple1<String> key : values) {
                // Use the tuple field (key.f0) as the row key; key.toString() would
                // include the tuple's surrounding parentheses in the key bytes.
                ImmutableBytesWritable rowkey = new ImmutableBytesWritable(key.f0.getBytes());
                KeyValue kv = new KeyValue(key.f0.getBytes(), family.getBytes(), column.getBytes(), key.f0.getBytes());
                out.collect(Tuple2.of(rowkey, kv));
            }
        }
    }

    /**
     * For local or test environments.
     * @param conf Configuration
     */
    private static void devHeader(Configuration conf) {
        // Submit from a local machine to the test cluster.
        conf.set("mapreduce.app-submission.cross-platform", "true");
        conf.set("mapreduce.job.ubertask.enable", "true");
        conf.set("fs.defaultFS", "hdfs://2.2.2.2:8020");
        // Recurse into subdirectories under the input path.
        conf.set("mapreduce.input.fileinputformat.input.dir.recursive", "true");
        System.setProperty("hadoop.home.dir", "D:\\soft\\developsoft\\Hadoop\\hadoop-2.6.5");
        System.setProperty("HADOOP_USER_NAME", "hdfs");
    }
}
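The HFiles written to HDFS still have to be bulk-loaded into HBase before the data becomes queryable. A minimal sketch using LoadIncrementalHFiles, assuming an HBase 1.x client and a hypothetical table name ("test_table") whose column family matches the one used above; the same class can also be invoked from the hbase command-line launcher:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;

public class BulkLoadHfiles {

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        // Hypothetical table; it must already exist and contain the column family
        // the HFiles were generated with (e.g. "datasfamily").
        TableName tableName = TableName.valueOf("test_table");

        try (Connection conn = ConnectionFactory.createConnection(conf);
             Admin admin = conn.getAdmin();
             Table table = conn.getTable(tableName);
             RegionLocator locator = conn.getRegionLocator(tableName)) {

            LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
            // Directory produced by the Flink job above.
            loader.doBulkLoad(new Path("hdfs://10.111.32.165:8020/user/zhao/out0226/9/"), admin, table, locator);
        }
    }
}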

Here is the pom as well:

 <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <logback.version>1.1.5</logback.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka_2.11</artifactId>
            <version>1.0.1</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
                <exclusion>
                    <artifactId>jmxri</artifactId>
                    <groupId>com.sun.jmx</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>jmxtools</artifactId>
                    <groupId>com.sun.jdmk</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>jms</artifactId>
                    <groupId>javax.jms</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>junit</artifactId>
                    <groupId>junit</groupId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>com.typesafe</groupId>
            <artifactId>config</artifactId>
            <version>1.2.1</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>
        <!--******** logback and slf4j ******** -->
        <dependency>
            <groupId>ch.qos.logback</groupId>
            <artifactId>logback-core</artifactId>
            <version>${logback.version}</version>
        </dependency>
        <dependency>
            <groupId>ch.qos.logback</groupId>
            <artifactId>logback-classic</artifactId>
            <version>${logback.version}</version>
        </dependency>

        <dependency>
            <groupId>ch.qos.logback</groupId>
            <artifactId>logback-access</artifactId>
            <version>${logback.version}</version>
        </dependency>
        <dependency>
            <groupId>commons-codec</groupId>
            <artifactId>commons-codec</artifactId>
            <version>RELEASE</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.4.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <transformers>
                                <transformer
                                        implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>com.geotmt.dw.Application</mainClass>
                                </transformer>
                            </transformers>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
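The excerpt above only covers Kafka, Typesafe Config, and logging; the Flink, Hadoop, and HBase dependencies the code relies on are not shown. A rough sketch of what would need to be added, assuming Flink 1.7.x (to match the docs link above), Hadoop 2.6.x, and an HBase 1.x client; the exact versions and Scala suffix must match your cluster:

<!-- Assumed versions; adjust to your cluster. -->
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-java</artifactId>
    <version>1.7.2</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-clients_2.11</artifactId>
    <version>1.7.2</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-hadoop-compatibility_2.11</artifactId>
    <version>1.7.2</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.6.5</version>
</dependency>
<dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-server</artifactId>
    <version>1.2.6</version>
</dependency>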

Closing thoughts:

    Flink is still a fairly young technology; let's watch it grow up.
