Flink Basic Environment
Prerequisites
- JDK 1.8+ installed
- HDFS up and running (passwordless SSH configured)
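A quick sanity check before installing (process names below assume a single-node HDFS; adjust to your layout):
[root@centos ~]# java -version
[root@centos ~]# jps    # NameNode and DataNode should be listed if HDFS is up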
Flink Installation
- Upload and extract Flink
[root@centos ~]# tar -zxf flink-1.8.1-bin-scala_2.11.tgz -C /usr/
- Configure flink-conf.yaml
[root@centos ~]# vi /usr/flink-1.8.1/conf/flink-conf.yaml
jobmanager.rpc.address: centos
taskmanager.numberOfTaskSlots: 3
parallelism.default: 3 # example value; the original left this blank
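Note: taskmanager.numberOfTaskSlots caps how many parallel subtasks one TaskManager can host, so with a single TaskManager the default job parallelism should not exceed 3.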
- Configure slaves
[root@centos ~]# vi /usr/flink-1.8.1/conf/slaves
centos
- Start Flink
[root@centos flink-1.8.1]# ./bin/start-cluster.sh
Starting cluster.
Starting standalonesession daemon on host centos.
Starting taskexecutor daemon on host centos.
[root@centos flink-1.8.1]# jps
2912 Jps
2841 TaskManagerRunner
2397 StandaloneSessionClusterEntrypoint
Visit the web UI at http://centos:8081
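The dashboard should report one TaskManager with 3 available task slots, matching taskmanager.numberOfTaskSlots above.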
Dependencies
<properties>
    <flink.version>1.8.1</flink.version>
    <scala.version>2.11</scala.version>
</properties>
<dependencies>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-clients_${scala.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-scala_${scala.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
</dependencies>
A Simple Example
package flink

import org.apache.flink.streaming.api.scala._

object FlinkWordCount {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // read raw lines from a socket; the wildcard import above also supplies
    // the implicit TypeInformation the Scala API needs
    val dataStream: DataStream[String] = env.socketTextStream("centos", 4444)
    dataStream
      .flatMap(_.split("\\s+")) // split each line into words
      .map((_, 1))              // pair every word with an initial count of 1
      .keyBy(_._1)              // partition the stream by word
      .sum(1)                   // keep a running count per word
      .print()
    env.execute("flink word count")
  }
}
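To try it, open the socket on centos before starting the job (nc comes with the netcat package), then either run the program from the IDE or package it and submit it to the cluster; the jar path below is a placeholder. Words typed into the nc terminal are counted, and on the cluster print() writes to the TaskManager .out file under log/.
[root@centos ~]# nc -lk 4444
[root@centos flink-1.8.1]# ./bin/flink run -c flink.FlinkWordCount /path/to/your-job.jar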
Custom Sources and Sinks in Flink
A simple word count with:
source: Kafka
sink: Redis
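Beyond the two dependencies above, this example needs the Kafka connector and a Redis sink. A sketch of the extra POM entries, assuming Flink 1.8.1 with Scala 2.11 (the Redis connector ships from Apache Bahir and provides the org.apache.flink.streaming.connectors.redis classes used below):
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka_${scala.version}</artifactId>
    <version>${flink.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.bahir</groupId>
    <artifactId>flink-connector-redis_${scala.version}</artifactId>
    <version>1.0</version>
</dependency>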
package flink

import java.util.Properties

import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka.{FlinkKafkaConsumer, KafkaDeserializationSchema}
import org.apache.flink.streaming.connectors.redis.RedisSink
import org.apache.flink.streaming.connectors.redis.common.config.FlinkJedisPoolConfig
import org.apache.flink.streaming.connectors.redis.common.mapper.{RedisCommand, RedisCommandDescription, RedisMapper}
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}

object KafkaSourcesAndSink {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    // Kafka consumer config ("cluster_path" is a placeholder for host:port pairs)
    val props = new Properties()
    props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "cluster_path")
    props.put(ConsumerConfig.GROUP_ID_CONFIG, "group_id")

    // Redis connection config
    val config = new FlinkJedisPoolConfig.Builder().setHost("Spark").setPort(6379).build()

    // the consumer's element type is fixed by the deserialization schema below
    val flinkKafkaConsumer = new FlinkKafkaConsumer[(Int, Long, String, String, String)](
      "TopicFlink", new MyKafkaFlinkDeserialization, props)

    val dataStream: DataStream[(Int, Long, String, String, String)] =
      env.addSource(flinkKafkaConsumer).setParallelism(3)
    dataStream
      .map(t => t._5)           // keep only the record value
      .flatMap(_.split("\\s+")) // split each value into words
      .map((_, 1))
      .keyBy(_._1)
      .sum(1)                   // emit running (word, count) pairs
      .addSink(new RedisSink[(String, Int)](config, new MyRedisMapper))
    env.execute()
  }
}

// turns each raw Kafka record into (partition, offset, topic, key, value)
class MyKafkaFlinkDeserialization extends KafkaDeserializationSchema[(Int, Long, String, String, String)] {
  override def isEndOfStream(t: (Int, Long, String, String, String)): Boolean = false

  override def deserialize(consumerRecord: ConsumerRecord[Array[Byte], Array[Byte]]): (Int, Long, String, String, String) = {
    // use new String(...) on the byte arrays; calling toString on an Array[Byte]
    // only prints a reference address
    if (consumerRecord.key() != null)
      (consumerRecord.partition(), consumerRecord.offset(), consumerRecord.topic(),
        new String(consumerRecord.key()), new String(consumerRecord.value()))
    else
      (consumerRecord.partition(), consumerRecord.offset(), consumerRecord.topic(),
        "", new String(consumerRecord.value()))
  }

  // the element type the consumer produces downstream
  override def getProducedType: TypeInformation[(Int, Long, String, String, String)] =
    createTypeInformation[(Int, Long, String, String, String)]
}

// maps each (word, count) pair onto the Redis hash "word_count" via HSET
class MyRedisMapper extends RedisMapper[(String, Int)] {
  override def getCommandDescription: RedisCommandDescription =
    new RedisCommandDescription(RedisCommand.HSET, "word_count")

  override def getKeyFromData(t: (String, Int)): String = t._1

  override def getValueFromData(t: (String, Int)): String = t._2.toString
}
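Once the job is running and messages have been produced to TopicFlink, the accumulated counts can be checked from the Redis side; HGETALL lists every field/value pair of the word_count hash (the -h host matches the FlinkJedisPoolConfig above):
[root@centos ~]# redis-cli -h Spark HGETALL word_count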