Chapter 05

Click-event source (ClickSource)

package com.liao.chapter05

import java.util.Calendar

import org.apache.flink.streaming.api.functions.source.SourceFunction
import org.apache.flink.streaming.api.watermark.Watermark

import scala.util.Random

class ClickSource extends SourceFunction[Event]{
  // Flag that controls the emit loop
  var running = true

  override def run(ctx: SourceFunction.SourceContext[Event]): Unit = {
    // Random generator for test data
    val random = new Random()

    // Value pools to choose from
    val users = Array("Mary", "Alice", "Bob", "Cary")
    val urls = Array("./home", "./cart", "./fav", "./prod?id=1", "./prod?id=2", "./prod?id=3")

    // Use the flag as the loop condition and keep emitting data
    while (running) {
      val event = Event(users(random.nextInt(users.length)), urls(random.nextInt(urls.length)), Calendar.getInstance.getTimeInMillis)
      // Optionally assign a timestamp to each record...
//      ctx.collectWithTimestamp(event, event.timestamp)
//
//      // ...and emit a watermark directly from the source
//      ctx.emitWatermark(new Watermark(event.timestamp - 1L))

      // Call collect on the source context to send the record downstream
      ctx.collect(event)
      // Emit one record per second
      Thread.sleep(1000)

    }
  }

  override def cancel(): Unit = running = false
}

JdbcSinkTest

package com.liao.chapter05

import java.sql.{Connection, DriverManager, PreparedStatement}

import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction
import org.apache.flink.streaming.api.scala._

object JdbcSinkTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

//    val stream :DataStream[Event]= env.fromElements(
//      Event("Mary", "./home", 1000L),
//      Event("Bob", "./cart", 2000L),
//      Event("Bob", "./cart", 3000L),
//      Event("Alice", "./cart", 3000L),
//      Event("Mary", "./prod?id=1", 4000L),
//      Event("Mary", "./prod?id=3", 6000L),
//      Event("Mary", "./prod?id=2", 5000L)
//    )

    val stream = env.addSource(new ClickSource)

    stream.addSink( new MyJdbcSinkFunc() )

    env.execute("jdbc sink test")

  }

  class MyJdbcSinkFunc() extends RichSinkFunction[Event]{
    // JDBC connection and prepared statements
    var conn: Connection = _
    var insertStmt: PreparedStatement = _
    var updateStmt: PreparedStatement = _

    override def open(parameters: Configuration): Unit = {
      conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/test?useSSL=false","root","1234")
      insertStmt = conn.prepareStatement("insert into event (user,url) values(?,?) ")
      updateStmt = conn.prepareStatement("update event set url = ? where user =? ")

    }

    override def invoke(value: Event): Unit = {
      // Run the update first: if the user already exists, the row is updated
      updateStmt.setString(1,value.url)
      updateStmt.setString(2,value.user)
      updateStmt.execute()
      // If the update matched no rows, insert a new record instead
      if( updateStmt.getUpdateCount == 0 ){
        insertStmt.setString(1,value.user)
        insertStmt.setString(2,value.url)
        insertStmt.execute()
      }
    }

    override def close(): Unit = {
      insertStmt.close()
      updateStmt.close()
      conn.close()
    }
  }


}
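The sink above assumes that a table named event, with user and url columns, already exists in the test database. Below is a minimal one-off helper that creates it over the same JDBC connection; the column types are an assumption inferred from the insert and update statements, not part of the original code.

package com.liao.chapter05

import java.sql.DriverManager

object CreateEventTable {
  def main(args: Array[String]): Unit = {
    // Same connection details as MyJdbcSinkFunc above
    val conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/test?useSSL=false", "root", "1234")
    val stmt = conn.createStatement()
    // Column types are assumed; adjust to your actual schema
    stmt.execute("create table if not exists event (user varchar(100), url varchar(200))")
    stmt.close()
    conn.close()
  }
}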

Broadcast partitioning test

(PartitionBroadcastTest)

package com.liao.chapter05

import org.apache.flink.streaming.api.scala._

object PartitionBroadcastTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // Read from the custom source
    val stream = env.addSource(new ClickSource)

    // Broadcast each record to all downstream subtasks, then print
    stream.broadcast.print("broadcast").setParallelism(4)

    // Send every record to the first downstream subtask (global), then print
    stream.global.print("global").setParallelism(4)

    env.execute()
  }
}

Custom partitioning test

(PartitionCustomTest)

package com.liao.chapter05

import org.apache.flink.api.common.functions.Partitioner
import org.apache.flink.streaming.api.scala._

object PartitionCustomTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // Create a simple stream of numbers
    val stream = env.fromElements(1, 2, 3, 4, 5, 6, 7, 8)

    // Custom repartitioning strategy: route by key parity (even keys to partition 0, odd keys to partition 1)
    stream.partitionCustom(new Partitioner[Int] {
      override def partition(key: Int, numPartitions: Int): Int = {
        key % 2
      }
    }, data => data)
      .print("custom").setParallelism(4)

    env.execute()
  }
}

Rebalance (round-robin) partitioning test

(PartitionReblanceTest)

package com.liao.chapter05

import org.apache.flink.streaming.api.scala._

object PartitionReblanceTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // Read from the custom source
    val stream = env.addSource(new ClickSource)

    // Round-robin repartition, then print
    stream.rebalance.print("rebalance").setParallelism(4)

    env.execute()
  }
}

Rescale partitioning test (PartitionRescaleTest)

package com.liao.chapter05

import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, SourceFunction}
import org.apache.flink.streaming.api.scala._

object PartitionRescaleTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // Define an inline parallel source that emits the numbers 1 to 8
    val stream = env.addSource(new RichParallelSourceFunction[Int] {
      override def run(ctx: SourceFunction.SourceContext[Int]): Unit = {
        for (i <- 0 to 7){
          // Use the subtask index from the runtime context to decide which parallel subtask emits each value
          if (getRuntimeContext.getIndexOfThisSubtask == (i + 1) % 2) {
            ctx.collect(i + 1)
          }
        }
      }

      // The emit loop is finite, so there is nothing to stop on cancel
      override def cancel(): Unit = {}
    }).setParallelism(2)

    // Rescale repartition (round-robin within each downstream group), then print
    stream.rescale.print("rescale").setParallelism(4)

    env.execute()
  }
}

Shuffle (random) partitioning test (PartitionShuffleTest)

package com.liao.chapter05

import org.apache.flink.streaming.api.scala._

object PartitionShuffleTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // Read from the custom source
    val stream = env.addSource(new ClickSource)

    // Randomly shuffle records across the downstream subtasks, then print
    stream.shuffle.print("shuffle").setParallelism(4)

    env.execute()


  }
}

SinkToEsTest (Elasticsearch sink test)

package com.liao.chapter05

import java.util

import org.apache.flink.api.common.functions.RuntimeContext
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.elasticsearch.{ElasticsearchSinkFunction, RequestIndexer}
import org.apache.flink.streaming.connectors.elasticsearch6.ElasticsearchSink
import org.apache.http.HttpHost
import org.elasticsearch.client.Requests

object SinkToEsTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    val stream :DataStream[Event]= env.fromElements(
      Event("Mary", "./home", 1000L),
      Event("Bob", "./cart", 2000L),
      Event("Bob", "./cart", 3000L),
      Event("Alice", "./cart", 3000L),
      Event("Mary", "./prod?id=1", 4000L),
      Event("Mary", "./prod?id=3", 6000L),
      Event("Mary", "./prod?id=2", 5000L)
    )

    // Define the host list of the Elasticsearch cluster
    val httpHosts = new util.ArrayList[HttpHost]()
    httpHosts.add(new HttpHost("hadoop002", 9200))

    // Define an Elasticsearch sink function
    val esFun = new ElasticsearchSinkFunction[Event]() {
      override def process(t: Event, runtimeContext: RuntimeContext, requestIndexer: RequestIndexer): Unit = {
        val data = new util.HashMap[String, String]()
        data.put(t.user, t.url)

        // Wrap the data into an HTTP index request
        val indexRequest = Requests.indexRequest()
          .index("clicks")
          .source(data)
          .`type`("event")

        // Hand the request to the indexer
        requestIndexer.add(indexRequest)
      }
    }

    stream.addSink( new ElasticsearchSink.Builder[Event](httpHosts,esFun).build() )

    env.execute()
  }
}
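When testing locally, the Elasticsearch sink buffers records into bulk requests, so data may not appear in the clicks index right away. A common tweak is to flush after every record by configuring the builder before calling build(); the sketch below reuses the httpHosts, esFun and stream values from the listing above rather than repeating them.

    val esBuilder = new ElasticsearchSink.Builder[Event](httpHosts, esFun)
    // Send each record as its own bulk request so it is written immediately (only sensible for testing)
    esBuilder.setBulkFlushMaxActions(1)
    stream.addSink(esBuilder.build())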

SinkToFileTest

(file sink test)

package com.liao.chapter05

import org.apache.flink.api.common.serialization.SimpleStringEncoder
import org.apache.flink.core.fs.Path
import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink
import org.apache.flink.streaming.api.scala._

object SinkToFileTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(4)

    val stream :DataStream[Event]= env.fromElements(
      Event("Mary", "./home", 1000L),
      Event("Bob", "./cart", 2000L),
      Event("Bob", "./cart", 3000L),
      Event("Alice", "./cart", 3000L),
      Event("Mary", "./prod?id=1", 4000L),
      Event("Mary", "./prod?id=3", 6000L),
      Event("Mary", "./prod?id=2", 5000L)
    )

    val fileSink = StreamingFileSink
      .forRowFormat(new Path("./output"), new SimpleStringEncoder[String]("UTF-8"))
      .build()

    // Write the records as plain text, distributed across files by the parallel subtasks
    stream.map(_.toString).addSink( fileSink )


    env.execute()
  }
}

SinkTokafkaTest

package com.liao.chapter05

import java.util.Properties

import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka.{FlinkKafkaConsumer010, FlinkKafkaProducer011}

object SinkTokafkaTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // (Alternative) read the data from a local text file instead of Kafka
//    val stream = env.readTextFile("input/clicks.txt")


    var properties = new Properties()

    properties.setProperty("bootstrap.servers","hadoop002:9092")
    properties.setProperty("group.id","consumer-group")


    val stream:DataStream[String] = env.addSource(new FlinkKafkaConsumer010[String]("clicks", new SimpleStringSchema(), properties))
        .map(data => {
          val fields = data.split(",")
          Event(fields(0).trim, fields(1).trim, fields(2).trim.toLong).toString
        })

    // Write the data to the "events" Kafka topic
    stream.addSink(new FlinkKafkaProducer011[String]("hadoop002:9092","events",new SimpleStringSchema()))

    env.execute()


}
}

SinkToRedisTest

package com.liao.chapter05

import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.redis.RedisSink
import org.apache.flink.streaming.connectors.redis.common.config.FlinkJedisPoolConfig
import org.apache.flink.streaming.connectors.redis.common.mapper.{RedisCommand, RedisCommandDescription, RedisMapper}

object SinkToRedisTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    val stream = env.addSource(new ClickSource)

    // Build the Jedis connection pool configuration
    val conf = new FlinkJedisPoolConfig.Builder()
      .setHost("hadoop002")
      .build()


    stream.addSink(new RedisSink[Event](conf,new MyRedisMapper))

    env.execute()

  }

  // Implement the RedisMapper interface: store clicks in a Redis hash, with user as field and url as value
  class MyRedisMapper extends RedisMapper[Event] {
    override def getCommandDescription: RedisCommandDescription = new RedisCommandDescription(RedisCommand.HSET,"clicks")

    override def getKeyFromData(t: Event): String = t.user

    override def getValueFromData(t: Event): String = t.url
  }

}

Bounded source test

(SourceBoundedTest)

package com.liao.chapter05

import org.apache.flink.streaming.api.scala._



// A click event: the user name, the URL visited, and a timestamp in milliseconds
case class Event(user: String, url: String, timestamp: Long)

object SourceBoundedTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // 1. Read data from individual elements
    val stream:DataStream[Int]= env.fromElements(1, 2, 3, 4, 5)

    val stream1:DataStream[Event] = env.fromElements(Event("Mary", "./home", 1000L),
      Event("Bob", "./cart", 2000L)
    )

    // 2. Read data from a collection
    val clicks = List(Event("Mary", "./home", 1000L),Event("Bob", "./cart", 2000L))
    val stream2:DataStream[Event] = env.fromCollection(clicks)

    // 3. Read data from a file
    val stream3:DataStream[String] = env.readTextFile("input/clicks.txt")

    // Print the results
    stream.print("number")
    stream1.print("1")
    stream2.print("2")
    stream3.print("3")

    env.execute()
  }
}
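Besides elements, collections and files, a socket is another handy source for quick unbounded-input experiments. A minimal sketch in the same style follows; the host hadoop002 and port 7777 are assumptions, and something like `nc -lk 7777` must be running there first.

package com.liao.chapter05

import org.apache.flink.streaming.api.scala._

object SourceSocketTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // 4. Read unbounded text data from a socket (host and port are assumptions)
    val stream: DataStream[String] = env.socketTextStream("hadoop002", 7777)

    stream.print("socket")

    env.execute()
  }
}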

Custom source test

(SourceCustomTest)

package com.liao.chapter05

import org.apache.flink.streaming.api.scala._

object SourceCustomTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // Read from the custom source
    val stream:DataStream[Event] = env.addSource(new ClickSource)

    stream.print()

    env.execute()

  }
}

SourceKafkaTest

package com.liao.chapter05

import java.util.Properties

import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010

object SourceKafkaTest {
  def main(args: Array[String]): Unit = {
    var env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // Kafka connection settings kept in a Properties object
    val properties = new Properties()

    properties.setProperty("bootstrap.servers","hadoop002:9092")
    properties.setProperty("group.id","consumer-group")


    val stream:DataStream[String] = env.addSource(new FlinkKafkaConsumer010[String]("clicks", new SimpleStringSchema(), properties))

    stream.print()

    env.execute()
  }
}

TransformAggTest

package com.liao.chapter05

import org.apache.flink.api.java.functions.KeySelector
import org.apache.flink.streaming.api.scala._

object TransformAggTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    val stream: DataStream[Event] = env.fromElements(
      Event("Mary", "./home", 1000L),
      Event("Bob", "./cart", 2000L),
      Event("Alice", "./cart", 3000L),
      Event("Mary", "./prod?id=1", 4000L),
      Event("Mary", "./prod?id=3", 6000L),
      Event("Mary", "./prod?id=2", 5000L)
    )
    // Other ways to specify the key:
//    stream.keyBy(new MyKeySelector()).print()
//    stream.keyBy(data => data.user)
//    stream.keyBy(_.user)
//    stream.keyBy(_.url)

    // Key by user, then keep the event with the largest timestamp for each user
    stream.keyBy(new MyKeySelector())
      .maxBy("timestamp")
      .print()


    env.execute()

  }

  class MyKeySelector() extends KeySelector[Event, String] {
    override def getKey(in: Event): String = in.user
  }

}
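A note on the aggregation above: maxBy("timestamp") returns the whole event with the largest timestamp, while its sibling max("timestamp") only updates the timestamp field and keeps the other fields from the first event seen for that key. A minimal sketch contrasting the two (a separate test object, not part of the original listing):

package com.liao.chapter05

import org.apache.flink.streaming.api.scala._

object TransformMaxVsMaxByTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    val stream: DataStream[Event] = env.fromElements(
      Event("Mary", "./home", 1000L),
      Event("Mary", "./prod?id=1", 4000L),
      Event("Mary", "./prod?id=3", 6000L)
    )

    // max: only the timestamp field advances, the url stays "./home"
    stream.keyBy(_.user).max("timestamp").print("max")
    // maxBy: the full event with the largest timestamp is returned
    stream.keyBy(_.user).maxBy("timestamp").print("maxBy")

    env.execute()
  }
}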

TransformFilterTest

package com.liao.chapter05

import org.apache.flink.api.common.functions.FilterFunction
import org.apache.flink.streaming.api.scala._

object TransformFilterTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    val stream:DataStream[Event] = env.fromElements(Event("Mary", "./home", 1000L), Event("Bob", "./cart", 2000L))

    // Keep only the click events where the user is Mary
    // 1. Using an anonymous function

    stream.filter( _.user == "Mary"  ).print("1")

    // 2. Implementing the FilterFunction interface
    stream.filter(new UserFilter).print("2")

    env.execute()

  }
  class UserFilter extends FilterFunction[Event]{
    override def filter(t: Event): Boolean = t.user == "Mary"
  }
}

TransformFlatmapTest

package com.liao.chapter05

import org.apache.flink.api.common.functions.FlatMapFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector

object TransformFlatmapTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    val stream :DataStream[Event]= env.fromElements(Event("Mary", "./home", 1000L), Event("Bob", "./cart", 2000L),
      Event("Alice", "./cart", 3000L))

    // Test the flexible output of flatMap: zero, one, or several records per input
    stream.flatMap( new MyFlatMap ).print()

    env.execute()
  }

  // Custom FlatMapFunction implementation
  class MyFlatMap extends FlatMapFunction[Event,String]{
    override def flatMap(t: Event, collector: Collector[String]): Unit ={
      // If the current event is from Mary, emit only the user
      if(t.user == "Mary"){
        collector.collect(t.user)
      }
      // If the current event is from Bob, emit both the user and the url
      else if(t.user == "Bob"){
        collector.collect(t.user)
        collector.collect(t.url)
      }
    }

  }
}

TransformMapTest

package com.liao.chapter05

import org.apache.flink.api.common.functions.MapFunction
import org.apache.flink.streaming.api.scala._

object TransformMapTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    env.setParallelism(1)


    val stream:DataStream[Event] =
    env.fromElements(Event("Mary","./home",1000L),
      Event("Bob", "./cart", 2000L)
    )

    // Extract the user name from each click event
    // 1. Using an anonymous function
    stream.map( _.user ).print("1")

    // 2. Implementing the MapFunction interface
    stream.map(new UserExtractor).print("2")


    env.execute()
  }

  class UserExtractor extends MapFunction[Event,String]{
    override def map(t: Event): String = t.user
  }

}

TransformReduceTest

package com.liao.chapter05

import org.apache.flink.api.common.functions.ReduceFunction
import org.apache.flink.streaming.api.scala._

object TransformReduceTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    val stream :DataStream[Event]= env.fromElements(
      Event("Mary", "./home", 1000L),
      Event("Bob", "./cart", 2000L),
      Event("Bob", "./cart", 3000L),
      Event("Alice", "./cart", 3000L),
      Event("Mary", "./prod?id=1", 4000L),
      Event("Mary", "./prod?id=3", 6000L),
      Event("Mary", "./prod?id=2", 5000L)
    )

    // Reduce aggregation: find the currently most active user
    stream.map(data => (data.user, 1L))
      .keyBy(_._1)
      .reduce(new MySum())                 // count clicks per user
      .keyBy(data => true)                 // route all records to the same key/group
      .reduce((state, data) => if (data._2 >= state._2) data else state)   // keep the user with the larger count
      .print()

    env.execute()

  }

  // ReduceFunction that sums the click counts for the same user
  class MySum() extends ReduceFunction[(String, Long)] {
    override def reduce(t: (String, Long), t1: (String, Long)): (String, Long) = (t._1,t._2 + t1._2)

  }

}
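As with map and filter earlier, the ReduceFunction can also be written as an anonymous function instead of a class like MySum; a sketch of the equivalent per-user count step, usable as a drop-in for the .reduce(new MySum()) line above:

    stream.map(data => (data.user, 1L))
      .keyBy(_._1)
      .reduce((a, b) => (a._1, a._2 + b._2))   // anonymous-function equivalent of new MySum()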

TransformRichFunctionTest

package com.liao.chapter05

import org.apache.flink.api.common.functions.RichMapFunction
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.scala._

object  TransformRichFunctionTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(2)

    val stream :DataStream[Event]= env.fromElements(
      Event("Mary", "./home", 1000L),
      Event("Bob", "./cart", 2000L),
      Event("Bob", "./cart", 3000L),
      Event("Alice", "./cart", 3000L),
      Event("Mary", "./prod?id=1", 4000L),
      Event("Mary", "./prod?id=3", 6000L),
      Event("Mary", "./prod?id=2", 5000L)
    )


    // Define a custom RichMapFunction to test the rich-function lifecycle
    stream.map( new MyRichMap )
      .print()

    env.execute()


  }

  class MyRichMap() extends RichMapFunction[Event, Long] {

    // open() is called once per parallel subtask before any records are processed
    override def open(parameters: Configuration): Unit =
      println("Subtask " + getRuntimeContext.getIndexOfThisSubtask + " starting")

    override def map(in: Event): Long = in.timestamp

    // close() is called once per parallel subtask after the last record
    override def close(): Unit =
      println("Subtask " + getRuntimeContext.getIndexOfThisSubtask + " finishing")
  }

}


TransformUDFTest

package com.liao.chapter05

import org.apache.flink.api.common.functions.FilterFunction
import org.apache.flink.streaming.api.scala._

object TransformUDFTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    val stream :DataStream[Event]= env.fromElements(
      Event("Mary", "./home", 1000L),
      Event("Bob", "./cart", 2000L),
      Event("Bob", "./cart", 3000L),
      Event("Alice", "./cart", 3000L),
      Event("Mary", "./prod?id=1", 4000L),
      Event("Mary", "./prod?id=3", 6000L),
      Event("Mary", "./prod?id=2", 5000L)
    )


    // Test UDFs: select events whose url contains the keyword "prod"
    // 1. Using a custom function class
    stream.filter(new MyFilterFunction("prod"))
      .print("1")

    // 2. Using an anonymous class
    stream.filter( new FilterFunction[Event] {
      override def filter(t: Event): Boolean = t.url.contains("prod")
    } )
      .print("2")


    // 3. Using an anonymous function
    stream.filter(_.url.contains("prod"))
        .print("3")

    env.execute()

  }

  // A custom FilterFunction parameterized by a keyword
  class MyFilterFunction(keyWord: String) extends FilterFunction[Event]{
    override def filter(t: Event): Boolean = t.url.contains(keyWord)
  }
}
