Flink读取Netty数据示例代码

2 篇文章 0 订阅

本示例代码记录每张表数据的爬取进度

  1. 每张表实时ID
  2. 每张表实时爬取数量
  3. 记录每张表记录总数
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;

import java.io.Serializable;

/**
 * Snapshot of the current crawl progress for one table; sent over the
 * Netty socket as one JSON line and aggregated per table by the Flink job.
 *
 * @author songjianyong
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class SpiderCurrentInfo implements Serializable {
    private static final long serialVersionUID = 1L;

    /**
     * Table name (used as the Flink key for per-table aggregation)
     */
    private String table;

    /**
     * Largest primary key crawled so far
     */
    private Integer primaryKey;

    /**
     * Total number of rows in the table, or total number of rows to crawl
     */
    private Integer total;

    /**
     * Number of records crawled so far (reset to 0 on ingest; the reduce
     * step uses it as a running counter)
     */
    private Integer finishNum;

    /**
     * Whether crawling of this table has completed
     */
    private Boolean finish;
}

一、引入依赖

            <!--flink核心包-->
            <dependency>
                <groupId>org.apache.flink</groupId>
                <artifactId>flink-java</artifactId>
                <version>1.15.0</version>
            </dependency>
            <!--flink流处理包-->
            <dependency>
                <groupId>org.apache.flink</groupId>
                <artifactId>flink-streaming-java</artifactId>
                <version>1.15.0</version>
            </dependency>
            <dependency>
                <groupId>org.apache.flink</groupId>
                <artifactId>flink-clients</artifactId>
                <version>1.15.0</version>
            </dependency>

二、Netty服务端代码

    // Connected Flink client channels keyed by channel id (asLongText).
    // NOTE(review): a plain HashMap is written from Netty event-loop threads
    // (channelActive) and iterated from the sender thread (sendToFlink) —
    // consider ConcurrentHashMap; verify the threading before relying on it.
    Map<String, Channel> CHANNEL_MAP = new HashMap<>(4);    

    /**
     * Starts the Netty server on a dedicated thread so the caller is not
     * blocked by the blocking bind()/closeFuture() calls inside bind().
     */
    private void bindNetty() {
        Thread nettySpiderProgressRate = new Thread(() -> {
            try {
                bind();
            } catch (InterruptedException e) {
                // Fix: restore the interrupt flag instead of silently
                // swallowing the thread's interrupted status.
                Thread.currentThread().interrupt();
                log.error(e.getMessage(), e);
                throw new RuntimeException(e);
            }
        });
        nettySpiderProgressRate.setName("netty-spider-progress-rate");
        nettySpiderProgressRate.start();
    }

    /**
     * Bootstraps the Netty server, binds the configured port and blocks
     * until the server channel closes; both event-loop groups are always
     * shut down gracefully on exit.
     *
     * @throws InterruptedException if waiting for bind/close is interrupted
     */
    void bind() throws InterruptedException {
        // Acceptor and worker NIO thread groups.
        EventLoopGroup acceptorGroup = new NioEventLoopGroup();
        EventLoopGroup ioGroup = new NioEventLoopGroup();

        try {
            ServerBootstrap bootstrap = new ServerBootstrap()
                    .group(acceptorGroup, ioGroup)
                    .channel(NioServerSocketChannel.class)
                    .option(ChannelOption.SO_BACKLOG, 1024)
                    .childHandler(new ChildChannelHandler());

            ChannelFuture bindFuture = bootstrap.bind(bindPort).sync();
            // Block this thread until the server socket is closed.
            bindFuture.channel().closeFuture().sync();
        } finally {
            acceptorGroup.shutdownGracefully();
            ioGroup.shutdownGracefully();
        }
    }

    /**
     * Configures the pipeline of every accepted child channel: optional wire
     * logging (Windows only), effectively-disabled read/write timeouts,
     * line-based framing with UTF-8 decoding, then the business handler.
     */
    static class ChildChannelHandler extends ChannelInitializer<SocketChannel> {

        @Override
        protected void initChannel(SocketChannel ch) {
            ChannelPipeline pipeline = ch.pipeline();
            if (SystemUtils.IS_OS_WINDOWS) {
                // Verbose logging only on Windows developer machines.
                pipeline.addLast(new LoggingHandler(LogLevel.INFO));
            }
            // Long.MAX_VALUE seconds: keep the Flink socket connection open
            // indefinitely instead of Netty dropping it on read/write idle.
            pipeline.addLast(new ReadTimeoutHandler(Long.MAX_VALUE, TimeUnit.SECONDS));
            pipeline.addLast(new WriteTimeoutHandler(Long.MAX_VALUE, TimeUnit.SECONDS));
            pipeline.addLast(new LineBasedFrameDecoder(1024));
            pipeline.addLast(new StringDecoder(StandardCharsets.UTF_8));
            pipeline.addLast(new SpiderServerHandler());
        }
    }

    /**
     * Tracks connected Flink client channels in {@code CHANNEL_MAP} so the
     * spider can broadcast progress messages to them later.
     */
    static class SpiderServerHandler extends ChannelInboundHandlerAdapter {
        @Override
        public void channelActive(ChannelHandlerContext ctx) {
            String id = ctx.channel().id().asLongText();
            CHANNEL_MAP.put(id, ctx.channel());
            log.warn("收到连接:{}", id);
        }

        @Override
        public void channelInactive(ChannelHandlerContext ctx) {
            // Fix: remove closed channels — the original never removed them,
            // so CHANNEL_MAP grew without bound and sendToFlink kept writing
            // to dead connections.
            String id = ctx.channel().id().asLongText();
            CHANNEL_MAP.remove(id);
            log.warn("连接断开:{}", id);
        }

        @Override
        public void exceptionCaught(ChannelHandlerContext ctx, Throwable e) {
            log.error(e.getMessage(), e);
            // Close the channel on unhandled pipeline errors; channelInactive
            // then removes it from CHANNEL_MAP.
            ctx.close();
        }
    }

注意:Netty 的超时处理器若在 30 秒内未读取到数据,会自动断开与 Flink 的 Socket 连接;因此这里将读/写超时时间调大为:

Long.MAX_VALUE, TimeUnit.SECONDS

另一种方式就是设置.option(ChannelOption.SO_KEEPALIVE, true)

三、Flink监听代码

    /**
     * Builds and launches the Flink streaming job: reads newline-delimited
     * JSON from the local Netty socket, parses each line into a
     * SpiderCurrentInfo, aggregates progress per table and sinks the result.
     * The blocking execute() call runs on its own thread.
     */
    void flink() {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<String> source = env.socketTextStream("localhost", bindPort, System.lineSeparator());

        source.flatMap(new SpiderFlatMap())
                .keyBy((KeySelector<SpiderCurrentInfo, String>) SpiderCurrentInfo::getTable)
                .reduce(new SpiderReduce())
                .addSink(new SpiderSink());

        Thread jobThread = new Thread(() -> {
            try {
                // Blocks for the lifetime of the streaming job.
                env.execute("spider progress rate");
            } catch (Exception e) {
                log.error(e.getMessage(), e);
                throw new RuntimeException(e);
            }
        });
        jobThread.setName("fink-spider-progress-rate");
        jobThread.start();
    }

1、Flink之Reduce处理示例代码

import org.apache.flink.annotation.Public;
import org.apache.flink.api.common.functions.ReduceFunction;

import java.util.Objects;

/**
 * Merges the running per-table progress state with each newly arrived
 * record: keeps the larger primary key, counts one more crawled record and
 * latches the finish flag once any record reports completion.
 *
 * @author songjianyong
 */
@Public
public class SpiderReduce implements ReduceFunction<SpiderCurrentInfo> {
    private static final long serialVersionUID = 1L;

    @Override
    public SpiderCurrentInfo reduce(SpiderCurrentInfo v1, SpiderCurrentInfo v2) {
        // Finish is sticky: true as soon as either side reports true.
        boolean done = Objects.equals(v1.getFinish(), true) || Objects.equals(v2.getFinish(), true);
        // Incoming records carry finishNum == 0 (set in SpiderFlatMap), so
        // the accumulator's max value plus one acts as a record counter.
        int crawled = Math.max(v1.getFinishNum(), v2.getFinishNum()) + 1;
        return SpiderCurrentInfo.builder()
                .table(v1.getTable())
                .primaryKey(Math.max(v1.getPrimaryKey(), v2.getPrimaryKey()))
                .total(v1.getTotal())
                .finish(done)
                .finishNum(crawled)
                .build();
    }
}

2、Flink自定义数据扁平化处理示例代码

import com.alibaba.fastjson.JSON;
import org.apache.commons.lang3.StringUtils;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple4;
import org.apache.flink.util.Collector;

/**
 * Parses each newline-delimited JSON message from the socket into a
 * SpiderCurrentInfo and resets its finishNum so SpiderReduce can use the
 * field as a per-table record counter.
 *
 * @author songjianyong
 */
public class SpiderFlatMap implements FlatMapFunction<String, SpiderCurrentInfo> {
    private static final long serialVersionUID = 1L;

    @Override
    public void flatMap(String value, Collector<SpiderCurrentInfo> out) {
        // Fix: a blank line (e.g. keep-alive or trailing newline) previously
        // fell through to the parser and could fail the whole Flink job.
        if (StringUtils.isBlank(value)) {
            return;
        }
        SpiderCurrentInfo info = JSON.parseObject(value, SpiderCurrentInfo.class);
        if (info == null) {
            return;
        }
        // Normalize: every raw record contributes 0 here; the reduce step
        // increments, so counts are not skewed by sender-side values.
        info.setFinishNum(0);
        out.collect(info);
    }
}

3、Flink数据下沉落地示例代码

import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.tuple.Triple;
import org.apache.flink.annotation.Public;
import org.apache.flink.api.java.tuple.Tuple4;
import org.apache.flink.streaming.api.functions.sink.SinkFunction;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Objects;

import static com.song.spider.imdb.constant.Constants.ALL_SPIDER_PROCESS;
import static com.song.spider.imdb.service.impl.BaseService.TMP_DIR;

/**
 * Persists per-table crawl progress: updates the in-memory progress map and
 * checkpoints the latest primary key to a temp file so a restart can resume.
 * Once a table is finished, its progress entry and temp file are removed.
 *
 * @author songjianyong
 */
@Slf4j
@Public
public class SpiderSink implements SinkFunction<SpiderCurrentInfo> {
    private static final long serialVersionUID = 1L;

    @Override
    public void invoke(SpiderCurrentInfo value, Context context) {
        String table = value.getTable();
        ALL_SPIDER_PROCESS.put(table, Triple.of(value.getPrimaryKey(), value.getTotal(), value.getFinishNum()));

        Path progressFile = Paths.get(TMP_DIR, table);
        try {
            if (Objects.equals(true, value.getFinish())) {
                // Finished tables need neither progress tracking nor a resume file.
                ALL_SPIDER_PROCESS.remove(table);
                Files.deleteIfExists(progressFile);
                return;
            }
            // Checkpoint the highest crawled primary key for crash recovery.
            byte[] checkpoint = value.getPrimaryKey().toString().getBytes(StandardCharsets.UTF_8);
            Files.write(progressFile, checkpoint);
        } catch (IOException e) {
            log.error(e.getMessage(), e);
            throw new RuntimeException(e);
        }
    }
}

四、整合启动

    @Override
    public void run(String... args) {
        // Start the Netty server first so its port is bound before Flink's
        // socketTextStream attempts to connect — the ordering matters.
        bindNetty();
        flink();
    }

启动netty在前,Flink监听启动在后!

netty写入数据示例代码

    /**
     * Broadcasts one progress record to every connected Flink socket client
     * as a single JSON line (newline-terminated to match the server's
     * LineBasedFrameDecoder and Flink's line delimiter).
     *
     * @param info progress snapshot to publish
     */
    protected void sendToFlink(SpiderCurrentInfo info) {
        // Hoisted out of the loop: encode the payload once, not per channel.
        String line = JSON.toJSONString(info) + System.lineSeparator();
        byte[] payload = line.getBytes(StandardCharsets.UTF_8);
        for (Channel channel : CHANNEL_MAP.values()) {
            // Skip channels that have gone down; writing to them would only
            // produce failed futures.
            if (channel.isActive()) {
                // copiedBuffer per channel: each write consumes its own ByteBuf.
                channel.writeAndFlush(Unpooled.copiedBuffer(payload));
            }
        }
    }

Flink简介https://www.kancloud.cn/zhangpn/flink/1717796

Apache Flink 文档https://nightlies.apache.org/flink/flink-docs-master/zh/

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值