1. Flink source
(1) File data source
val wordAgg: AggregateDataSet[(String, Int)] = env.readTextFile("./test.txt").flatMap(_.split(" ")).map((_,1)).groupBy(0).sum(1)
wordAgg.print()
(2) HDFS data source
val hdfsDS: DataSet[String] = env.readTextFile("hdfs://node01:8020/test.txt")
(3) Kafka data source
val prop = new Properties()
prop.setProperty("bootstrap.servers", "192.168.1.200:9092,192.168.1.201:9092,192.168.1.202:9092")
prop.setProperty("group.id", "order")
prop.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
prop.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
prop.setProperty("auto.offset.reset", "latest")
val orderDS: DataStream[String] = env.addSource(new FlinkKafkaConsumer011[String]("order", new SimpleStringSchema(), prop))
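Note that FlinkKafkaConsumer011 comes from the flink-connector-kafka-0.11 dependency (pick the artifact built for your Scala version).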
(4) Database sources are not listed one by one here.
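As one hedged illustration that is not from the original notes: a relational table can be read into a DataSet with the legacy flink-jdbc JDBCInputFormat. The MySQL driver, URL, credentials, and the t_user(id, name) table below are placeholders.
import org.apache.flink.api.common.typeinfo.BasicTypeInfo
import org.apache.flink.api.java.io.jdbc.JDBCInputFormat
import org.apache.flink.api.java.typeutils.RowTypeInfo
import org.apache.flink.types.Row
// result schema of the placeholder query: (id INT, name VARCHAR)
implicit val rowTypeInfo: RowTypeInfo = new RowTypeInfo(BasicTypeInfo.INT_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO)
val jdbcInput = JDBCInputFormat.buildJDBCInputFormat()
  .setDrivername("com.mysql.jdbc.Driver")
  .setDBUrl("jdbc:mysql://node01:3306/test")
  .setUsername("root")
  .setPassword("123456")
  .setQuery("select id, name from t_user")
  .setRowTypeInfo(rowTypeInfo)
  .finish()
val userDS: DataSet[Row] = env.createInput(jdbcInput)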
2. Flink sink
(1) Write to a local file
val localDS: DataSet[String] = env.readTextFile("./test.txt")
localDS.writeAsText("./output/1.txt")
(2) Write to a file on HDFS
wordDataSet.writeAsText("hdfs://cdh1:8020/testData/1.txt")
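Note that writeAsText only registers a sink; in the DataSet API the job is actually triggered by env.execute() (print(), by contrast, triggers execution on its own).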
(3) HBase sink
class myHBaseSink(tableName: String, family: String) extends RichSinkFunction[Order] {
// HBase connection handle, initialized in open()
var conn: Connection = _
// open the HBase connection once per parallel sink instance
override def open(parameters: Configuration): Unit = {
super.open(parameters)
val conf = HBaseConfiguration.create()
conf.set(HConstants.ZOOKEEPER_QUORUM, "192.168.1.200,192.168.1.201,192.168.1.202")
conn = ConnectionFactory.createConnection(conf)
}
// write one Order record per invoke() call
override def invoke(value: Order, context: SinkFunction.Context[_]): Unit = {
val put: Put = new Put(Bytes.toBytes(value.orderId))
val table: Table = conn.getTable(TableName.valueOf(tableName))
put.addColumn(Bytes.toBytes(family), Bytes.toBytes("orderId"), Bytes.toBytes(value.orderId))
put.addColumn(Bytes.toBytes(family), Bytes.toBytes("userId"), Bytes.toBytes(value.userId))
put.addColumn(Bytes.toBytes(family), Bytes.toBytes("money"), Bytes.toBytes(value.money))
put.addColumn(Bytes.toBytes(family), Bytes.toBytes("timestamp"), Bytes.toBytes(value.timestamp))
table.put(put)
table.close()
}
// close the connection
override def close(): Unit = {
super.close()
conn.close()
}
}
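A minimal usage sketch, not from the original notes: it assumes Order(orderId: String, userId: String, money: Double, timestamp: Long) with the fields read by the sink above, the orderDS Kafka stream from section 1, and a placeholder HBase table "order" with column family "info".
orderDS
  .map { line =>
    // parse a CSV line into the assumed Order case class
    val f = line.split(",")
    Order(f(0), f(1), f(2).toDouble, f(3).toLong)
  }
  .addSink(new myHBaseSink("order", "info"))
env.execute("kafka-to-hbase")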
3. Transformation operators
(1) map operator
// assumed case class for the mapped records
case class Users(id: String, name: String)
val textDataSet: DataSet[String] = env.fromCollection(
List("1,张三", "2,李四", "3,王五", "4,赵六")
)
textDataSet.map {
text =>
val fieldArr = text.split(",")
Users(fieldArr(0), fieldArr(1))
}.writeAsText("./output/2.txt")
(2) flatMap operator
// userDataSet is a DataSet[String] whose lines hold four comma-separated fields
userDataSet.flatMap { t =>
  val word = t.split(",")
  List(
    (word(0), word(1)),
    (word(0), word(1) + word(2)),
    (word(0), word(1) + word(2) + word(3)))
}.writeAsText("./output/3.txt")
(3) filter operator
val wordDataSet = env.fromCollection(List("hadoop", "hive", "spark", "flink"))
wordDataSet.filter{t=>t.startsWith("h")}.print()
(4) reduce operator
val wordCountDataSet = env.fromCollection(List(("java", 1), ("java", 1), ("java", 1)))
wordCountDataSet.reduce { (x, y) => (x._1, x._2 + y._2) }.print()
(5) groupBy operator
val wordcountDataSet = env.fromCollection(List(("java", 1), ("java", 1), ("scala", 1)))
wordcountDataSet.groupBy(x => x._1).reduce { (x, y) => (x._1, x._2 + y._2) }.print()
(6) reduceGroup operator
val wordcountDataSet = env.fromCollection(List(("java", 1), ("java", 1), ("scala", 1)))
wordcountDataSet.groupBy(x => x._1).reduceGroup {
  iter => iter.reduce { (x, y) => (x._1, x._2 + y._2) }
}.print()
reduce pulls the records one by one to another node and only then runs the computation there; reduceGroup first runs the computation on the node where each group's data resides and only then ships the aggregated results.
(7) aggregate operator
val wordcountDataSet = env.fromCollection(List(("java", 1), ("java", 1), ("scala", 1)))
wordcountDataSet.aggregate(Aggregations.SUM, 1).print()
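Without a preceding groupBy, aggregate(Aggregations.SUM, 1) sums field 1 across the whole DataSet; call groupBy(0) first to sum per key.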
(8) distinct
val wordcountDataSet = env.fromCollection(List(("java", 1), ("java", 1), ("scala", 1)))
wordcountDataSet.distinct(0).print()
(9) join
// case classes `score` and `subject` are assumed to match the CSV columns
val scoreDataSet = env.readCsvFile[score]("D:\\ideaProject\\flink-base\\output\\1.csv")
val subjectDataSet = env.readCsvFile[subject]("D:\\ideaProject\\flink-base\\output\\2.csv")
// join on the 3rd field of score and the 1st field of subject
val joinedDataSet = scoreDataSet.join(subjectDataSet).where(2).equalTo(0)
joinedDataSet.print()
(10) union
val wordDataSet1 = env.fromCollection(List("hadoop", "hive", "flume"))
val wordDataSet2 = env.fromCollection(List("hadoop", "hive", "spark"))
// union keeps duplicates; it does not deduplicate
val resultDataSet = wordDataSet1.union(wordDataSet2)
resultDataSet.print()
(11) rebalance
val env = ExecutionEnvironment.getExecutionEnvironment
val numDS: DataSet[Long] = env.generateSequence(0,100)
val resultDataSet= numDS.filter(_>8).rebalance().map(new RichMapFunction[Long,(Long,Long)] {
override def map(in: Long): (Long, Long) = {
(getRuntimeContext.getIndexOfThisSubtask, in)
}
})
resultDataSet.print()
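After the filter, records may be unevenly spread across subtasks; rebalance() redistributes them round-robin, and the RichMapFunction tags each record with the index of the subtask that processes it.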
(12) partitionByHash: partition by the hash of a key
val numDataSet = env.fromCollection(List(1,1,1,1,1,1,1,2,2,2,2,2))
val partitionDataSet: DataSet[Int] = numDataSet.partitionByHash(_.toString)
(13) sortPartition: sort the data within each partition by a key
val sortedDataSet = wordDataSet.sortPartition(_.toString, Order.DESCENDING)
4. Broadcast mechanism
Instead of having every slot pull its own copy of a shared dataset, broadcasting turns that shared dataset into a common resource that every parallel task instance can read.
val studentDataSet = env.fromCollection(List((1, "张三"), (2, "李四"), (3, "王五")))
val scoreDataSet = env.fromCollection(List((1, "语文", 50), (2, "数学", 70), (3, "英文", 86)))
// convert the score records (studentId, subject, score) -> (studentName, subject, score)
val resultDataSet = scoreDataSet.map(new RichMapFunction[(Int, String, Int), (String, String, Int)] {
var bc_studentList: List[(Int, String)] = _
// fetch the broadcast data
override def open(parameters: Configuration): Unit = {
import scala.collection.JavaConverters._
bc_studentList = getRuntimeContext.getBroadcastVariable[(Int, String)]("bc_student").asScala.toList }
// use the broadcast data to do the conversion
override def map(value: (Int, String, Int)): (String, String, Int) = {
(bc_studentList.filter(_._1 == value._1)(0)._2, value._2, value._3)
}
}).withBroadcastSet(studentDataSet, "bc_student")
resultDataSet.print()
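A second, smaller example broadcasts an entire DataSet and concatenates its elements onto every record of another DataSet.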
val env = ExecutionEnvironment.getExecutionEnvironment
val s1 = env.fromElements("1", "2", "3", "4", "5")
val s2 = env.fromElements("a", "b", "c", "d", "e")
s1.map(new RichMapFunction[String,(String,String)] {
var s2: Traversable[String] = null
override def open(parameters: Configuration): Unit = {
import collection.JavaConverters._
s2 = getRuntimeContext.getBroadcastVariable[String]("ds2_broadCast").asScala
}
override def map(in: String): (String, String) = {
  var r = ""
  for (v <- s2) {
    r = r + v
  }
  (in, r)
}
}).withBroadcastSet(s2, "ds2_broadCast").print()
5. Distributed cache
Flink provides a distributed cache similar to Hadoop's. The broadcast mechanism described above ships a DataSet into the memory of every TaskManager, whereas the distributed cache loads a file or directory from an external system (e.g. HDFS) and copies it to the local disk of each TaskManager.
object distributeFile {
def main(args: Array[String]): Unit = {
val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
val scoreDS: DataSet[(Int, String, Int)] = env.fromCollection(List((1, "语文", 50), (2, "数学", 70), (3, "英文", 86)))
env.registerCachedFile("D:\\ideaProject\\flink-base\\output\\distribute_cache_student", "cache_student")
scoreDS.map(new RichMapFunction[(Int, String, Int), (String, String, Int)] {
var cache_userList: List[(Int, String)] = _
override def open(parameters: Configuration): Unit = {
  // read the cached file from this TaskManager's local disk (needs java.io.File and scala.io.Source)
  val cache_student: File = getRuntimeContext.getDistributedCache.getFile("cache_student")
  val lines: Iterator[String] = Source.fromFile(cache_student).getLines()
  cache_userList = lines.map(line => {
    val lin: Array[String] = line.split(",")
    (lin(0).toInt, lin(1))
  }).toList
}
override def map(in: (Int, String, Int)): (String, String, Int) = {
(cache_userList.filter(_._1 == in._1)(0)._2, in._2, in._3)
}
}).print()}}