import org.apache.flink.streaming.api.functions.source.SourceFunction

/**
 * @Author: wpp
 * @Date: 2020/3/13 18:28
 */
class MyNoParallelScala extends SourceFunction[String] {
  var count = 1L
  // volatile so that cancel(), called from another thread, is visible to run()
  @volatile var isFlag = true

  override def run(ctx: SourceFunction.SourceContext[String]): Unit = {
    while (isFlag) {
      println("generated data " + count)
      ctx.collect("generated data " + count)
      count += 1
      Thread.sleep(100)
    }
  }

  override def cancel(): Unit = {
    isFlag = false
  }
}
1. First, define a custom source (the class above). A quick way to sanity-check it is shown below.
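Before wiring the source into the full job, it can be exercised with a minimal standalone job. This is only a sketch; the object name is arbitrary:

import org.apache.flink.streaming.api.scala._

object SourceSmokeTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // print() writes each generated record to stdout, one line per record
    env.addSource(new MyNoParallelScala()).print()
    env.execute("MyNoParallelScala-smoke-test")
  }
}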
2. Write the generated data to HDFS in batches; by default, one file is rolled per hour.
import java.io.File
import java.time.ZoneId
import java.util.Properties

import org.apache.flink.api.common.restartstrategy.RestartStrategies
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.api.common.state.MapStateDescriptor
import org.apache.flink.api.common.typeinfo.BasicTypeInfo
import org.apache.flink.api.java.utils.ParameterTool
import org.apache.flink.contrib.streaming.state.RocksDBStateBackend
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.fs.bucketing.{BucketingSink, DateTimeBucketer}
import org.apache.flink.streaming.connectors.kafka.{FlinkKafkaConsumer010, FlinkKafkaProducer010}

import scala.collection.JavaConverters._

import com.crgt.gtdata.pvuv.MyNoParallelScala
/**
 * Compute on-time / delayed status for vehicles in operation.
 */
object TestIntoHdfs {

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    var params = ParameterTool.fromPropertiesFile(
      TestIntoHdfs.getClass.getClassLoader.getResourceAsStream("app-dev.properties"))
    // File.pathSeparator is ";" only on Windows, so this branch runs on the
    // (Linux) production environment and switches to the prod config.
    if (!";".equals(File.pathSeparator)) {
      params = ParameterTool.fromPropertiesFile(
        TestIntoHdfs.getClass.getClassLoader.getResourceAsStream("app-prod.properties"))
      val rock = new RocksDBStateBackend("hdfs:///jobs/checkpoints/TestIntoHdfs")
      env.setStateBackend(rock)
      // Checkpoint every two minutes
      env.enableCheckpointing(120000)
      // Up to 60 restarts, 60 seconds apart
      env.getConfig.setRestartStrategy(RestartStrategies.fixedDelayRestart(60, 60000))
      env.getConfig.setGlobalJobParameters(params)
    }
    // Broadcast the dynamic config (note: the broadcast stream is built here
    // but not consumed anywhere in this job)
    val descriptor = new MapStateDescriptor("dynamicConfig", BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO)
    env.fromCollection(params.getRequired("event-id2").split(",")).broadcast(descriptor)
    val bootstrap_servers = params.getRequired("bootstrap.servers2")
    val topics = params.getRequired("topic-gps").split(",").toSeq.asJava
    val topicsDelay = params.getRequired("topic-gps-delay-time")
    val group_id = params.getRequired("group-id-gps-delay")

    val properties = new Properties
    properties.setProperty("bootstrap.servers", bootstrap_servers)
    properties.setProperty("group.id", group_id)
    properties.setProperty("enable.auto.commit", "true")
    properties.setProperty("auto.commit.interval.ms", "6000")

    val kafkaConsumer = new FlinkKafkaConsumer010(
      topics,
      new SimpleStringSchema,
      properties)
      .setStartFromLatest()
    val data: DataStream[String] = env
      // .addSource(kafkaConsumer)        // real source: Kafka
      .addSource(new MyNoParallelScala()) // test source defined above
      .map(row => {
        print(row)
        row
      })
    // val hadoopSink = new BucketingSink[String]("hdfs://gtdata-test01:9000/testw/")
    val hadoopSink = new BucketingSink[String]("hdfs://gtdata-test01:8020/testw/")
    // Name buckets with the pattern "yyyy-MM-dd--HH" in the UTC+8 (Shanghai) time zone
    hadoopSink.setBucketer(new DateTimeBucketer[String]("yyyy-MM-dd--HH", ZoneId.of("Asia/Shanghai")))
    // A new part file is rolled when either of the two conditions below is met:
    // 1. the part file reaches 100 MB
    hadoopSink.setBatchSize(1024 * 1024 * 100)
    // 2. the part file has been open for 20 minutes
    hadoopSink.setBatchRolloverInterval(20 * 60 * 1000)
    // Prefix and suffix for pending part files
    hadoopSink.setPendingPrefix("")
    hadoopSink.setPendingSuffix("")
    // Prefix for in-progress part files
    hadoopSink.setInProgressPrefix(".")
    data.addSink(hadoopSink)
    // Alternative sink: write back to Kafka.
    // FlinkKafkaProducer010 parameters: broker list, target topic, serializer.
    // data.addSink(new FlinkKafkaProducer010[String](bootstrap_servers, topicsDelay, new SimpleStringSchema()))

    env.execute("TestIntoHdfs-wpp")
  }
}
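Once the job is running, the output can be checked directly on HDFS: listing the sink directory configured above with `hdfs dfs -ls hdfs://gtdata-test01:8020/testw/` should show one sub-directory per hourly bucket (names like 2020-03-13--18, here only illustrative), each holding one part file per parallel sink instance.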
4. When data arrives, the bucketing sink splits it by the current system time and names the buckets with the date-time pattern "yyyy-MM-dd--HH". This pattern is passed to DateTimeFormatter, which combines it with the current system time in the UTC+8 (Asia/Shanghai) time zone to form the bucket path; a new bucket is created whenever the formatted value changes (with this pattern, once per hour).

Each bucket is itself a directory containing several part files: every parallel instance of the sink creates its own part file, and the sink also rolls a new part file once the current one exceeds 100 MB or has been open for more than 20 minutes. When a bucket becomes inactive (no longer in-progress), its open part files are flushed and closed. A bucket is considered inactive when it has not been written to recently; by default, the sink checks for inactive buckets once per minute and closes any bucket that has not been written to for over a minute.
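The one-minute check interval and inactivity threshold described above can also be tuned on the sink. A minimal sketch; both values shown are the defaults and appear here only for illustration:

// Both settings take milliseconds; the values shown are the BucketingSink defaults.
hadoopSink.setInactiveBucketCheckInterval(60 * 1000) // how often to scan for inactive buckets
hadoopSink.setInactiveBucketThreshold(60 * 1000)     // close buckets idle longer than this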