Flink流处理的Source
- 基于集合
- 基于文件
- 基于Socket
- 使用Kafka作为数据源
- 使用MySql作为数据源
- 自定义数据源
flink基于文件流或集合
object TextStream {
  /** Word count over a text file read as a stream; parallelism 1 keeps output ordered in one task. */
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    // Source: read the file line by line (swap in fromElements for a quick in-memory test).
    val lines: DataStream[String] = env.readTextFile("D://work//train_data//word.txt")
    //val lines: DataStream[String] = env.fromElements("hadoop hive spark", "hive spark hive", "hadoop hbase")
    lines
      .flatMap(line => line.split(" "))     // one record per word
      .map(word => (word, 1))               // pair each word with an initial count
      .keyBy(0)                             // partition by the word (tuple field 0)
      .sum(1)                               // running sum of field 1 per key
      .print()
    env.execute("StreamByText")
  }
}
flink对接socket流
node01 安装nc yum install nc -y
开启监听窗口: nc -l 8989
object SocketStream {
  /** Word count over a socket text stream (pair with `nc -l 8989` on node01). */
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // Single parallel task so printed counts arrive in a predictable order.
    env.setParallelism(1)
    // Source: newline-delimited text from the socket.
    val socketLines: DataStream[String] = env.socketTextStream("node01", 8989, '\n')
    socketLines
      .flatMap(line => line.split(" "))
      .map(word => (word, 1))
      .keyBy(0)
      .sum(1)
      .print()
    env.execute("WordCountBySocketStream")
  }
}
在nc监听窗口中发送数据观察程序输出
flink对接kafka
创建kafka topic
./kafka-topics.sh --zookeeper localhost:2181 --create --topic xxy --partitions 2 --replication-factor 1
打开kafka生产者命令行窗口
./kafka-console-producer.sh --broker-list node01:9092 --topic xxy
object KafkaStream {
  /** Word count over messages consumed from the Kafka topic "xxy". */
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // Consumer configuration: ZooKeeper quorum, broker list and consumer group id.
    val props = new Properties()
    props.put("zookeeper.connect", "node01:2181,node02:2181,node03:2181")
    props.put("bootstrap.servers", "node01:9092,node02:9092,node03:9092")
    props.put("group.id", "xxy")
    val consumer = new FlinkKafkaConsumer010[String]("xxy", new SimpleStringSchema(), props)
    val kafkaLines: DataStream[String] = env.addSource(consumer)
    kafkaLines
      .flatMap(line => line.split(" "))
      .map(word => (word, 1))
      .keyBy(0)
      .sum(1)
      .print()
    env.execute("StreamByKafka")
  }
}
自定义MySQL数据源
object MysqlStream {
  // 1. Custom source extending RichSourceFunction, emitting one (id, name, age) tuple per row.
  class MysqlSource extends RichSourceFunction[(Int, String, Int)] {
    var connection: Connection = null
    var statement: PreparedStatement = null
    // Cooperative cancellation flag, flipped by cancel(); volatile because cancel()
    // is invoked from a different thread than run().
    @volatile var isRunning: Boolean = true
    // Load the driver, open the connection, execute the query and collect each row.
    override def run(sourceContext: SourceFunction.SourceContext[(Int, String, Int)]): Unit = {
      Class.forName("com.mysql.jdbc.Driver")
      connection = DriverManager.getConnection("jdbc:mysql://node01:3306/xxy", "root", "123456")
      val sql = "select id,name,age from student"
      statement = connection.prepareStatement(sql)
      val result: ResultSet = statement.executeQuery()
      // Honour cancellation while iterating, so a cancelled job stops emitting promptly.
      while (isRunning && result.next()) {
        val id = result.getInt("id")
        val name = result.getString("name")
        val age = result.getInt("age")
        sourceContext.collect((id, name, age))
      }
    }
    // FIX: was `???`, which throws NotImplementedError when Flink cancels the job.
    override def cancel(): Unit = {
      isRunning = false
    }
    // FIX: release the statement before its parent connection (reverse acquisition order);
    // the original closed the connection first, which can invalidate the statement handle.
    override def close(): Unit = {
      if (statement != null) {
        statement.close()
      }
      if (connection != null) {
        connection.close()
      }
    }
  }

  def main(args: Array[String]): Unit = {
    val environment: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    val mysqlSource: DataStream[(Int, String, Int)] = environment.addSource(new MysqlSource)
    mysqlSource.print()
    environment.execute("StreamByMysql")
  }
}
Transformation 算子
keyBy
:按照指定的key来进行分流,类似于批处理中的groupBy
。可以按照索引名/字段名来指定分组的字段.
connect:
用来将两个DataStream组装成一个ConnectedStreams
。它用了两个泛型,即不要求两个dataStream的element是同一类型。这样我们就可以把不同的数据组装成同一个结构.
split
就是将一个DataStream分成多个流,用SplitStream
来表示
DataStream → SplitStream
select
就是获取分流后对应的数据,跟split搭配使用,从SplitStream中选择一个或多个流
flink 在流处理上常见的sink
sink到kafka
object KafkaSink {
  /** Writes a few concatenated student records to the Kafka topic "xxy". */
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // Flatten each (id, name, age) tuple into a single string record.
    val records: DataStream[String] = env
      .fromCollection(Array((4,"小明",19),(5,"小红",20),(6,"小米",88)))
      .map(t => t._1.toString + t._2 + t._3.toString)
    // Producer configuration (broker list is what the producer actually needs).
    val props = new Properties()
    props.put("zookeeper.connect", "node01:2181,node02:2181,node03:2181")
    props.put("bootstrap.servers", "node01:9092,node02:9092,node03:9092")
    props.put("group.id", "xxy")
    records.addSink(new FlinkKafkaProducer010[String]("xxy", new SimpleStringSchema(), props))
    env.execute("StreamSinkKafka")
  }
}
sink到mysql
// Custom sink that inserts each (name, age) pair into the `student` table.
class MysqlSink extends RichSinkFunction[(String, Int)]() {
  var connection: Connection = null
  var statement: PreparedStatement = null

  // Open the JDBC connection once per parallel sink instance.
  override def open(parameters: Configuration): Unit = {
    // Load the MySQL driver.
    Class.forName("com.mysql.jdbc.Driver")
    // FIX: the original URL contained a double '?' ("/xxy??useUnicode=..."),
    // which corrupts the JDBC query string; exactly one '?' starts the parameters.
    connection = DriverManager.getConnection("jdbc:mysql://node01:3306/xxy?useUnicode=true&characterEncoding=UTF-8", "root", "123456")
    val sql = "insert into student (name,age) values (?,?)"
    statement = connection.prepareStatement(sql)
  }

  // Bind the tuple fields to the prepared statement and execute one insert per record.
  override def invoke(value: (String, Int)): Unit = {
    statement.setString(1, value._1)
    statement.setInt(2, value._2)
    statement.executeUpdate()
  }

  // FIX: close the statement before the connection (reverse acquisition order);
  // the original closed the connection first, invalidating the statement handle.
  override def close(): Unit = {
    if (statement != null) {
      statement.close()
    }
    if (connection != null) {
      connection.close()
    }
  }
}
object MysqlSink {
  /** Streams three in-memory (name, age) pairs into MySQL via the custom sink. */
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    val people: DataStream[(String, Int)] =
      env.fromCollection(Array(("小明", 30), ("小红", 20), ("小米", 16)))
    people.addSink(new MysqlSink)
    env.execute("StreamSinkMysql")
  }
}