import java.time.format.DateTimeFormatter
import java.time.{Instant, LocalDateTime, ZoneId}
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Minutes, StreamingContext}
/**
* @author FRX
* @date 2020/7/8
* 项目描述:sparkstreaming,每十分钟读取一次数据,按天+小时分区存入
* (Project: Spark Streaming — reads data every 10 minutes, writes it partitioned by day + hour.)
*/
/**
 * Spark Streaming job: pulls records from Kafka on a 10-minute batch interval
 * and writes them to HDFS, partitioned by day + hour via
 * [[RDDMultipleTextOutputFormat]].
 */
object FileToHdfs {

  // Shared formatter for "yyyy-MM-dd HH:mm:ss"; DateTimeFormatter is
  // immutable and thread-safe, so hoisting it avoids re-creating it per call.
  private val TimestampFormat = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")

  def main(args: Array[String]): Unit = {
    import org.apache.kafka.common.serialization.StringDeserializer
    import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
    import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
    import org.apache.spark.streaming.kafka010._

    // Output path may be supplied as the first CLI argument; falls back to the
    // original hard-coded placeholder for backward compatibility.
    val outputPath = if (args.nonEmpty) args(0) else "输出路径"

    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "localhost:9092,anotherhost:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "use_a_separate_group_id_for_each_stream",
      "auto.offset.reset" -> "latest",
      // Offsets are not auto-committed; NOTE(review): no manual commit is
      // visible either, so records may be re-read after a restart — confirm.
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    val conf = new SparkConf().setAppName("AppstoreVideoInfo")
      // Allow writing into an existing path: file names carry
      // day/hour/minute, so name collisions are not expected in normal runs.
      .set("spark.hadoop.validateOutputSpecs", "false")
    val ssc = new StreamingContext(conf, Minutes(10))
    ssc.sparkContext.setLogLevel("ERROR")

    val topics = Array("topicA", "topicB")
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )

    // Keep only the record value; the empty string becomes the (unused)
    // value half of the (key, value) pair expected by saveAsHadoopFile.
    val tranData = stream.map(record => (record.value(), ""))
    // repartition(1) => a single part file per batch per partition directory.
    tranData.repartition(1).foreachRDD { rdd =>
      rdd.saveAsHadoopFile(outputPath, classOf[String], classOf[String],
        classOf[RDDMultipleTextOutputFormat])
    }

    ssc.start()
    ssc.awaitTermination()
  }

  /**
   * Formats an epoch-millisecond timestamp as "yyyy-MM-dd HH:mm:ss"
   * in the system default time zone.
   *
   * @param time epoch milliseconds
   * @return formatted local date-time string
   */
  def coverTimeStampToString(time: Long): String =
    TimestampFormat.format(
      LocalDateTime.ofInstant(Instant.ofEpochMilli(time), ZoneId.systemDefault()))
}
/**
 * Output format that routes each record into a "day=YYYY-MM-DD/hour=HH"
 * directory and appends the current minute to the part-file name,
 * e.g. day=2020-07-10/hour=17/part-00000_26.
 */
class RDDMultipleTextOutputFormat extends MultipleTextOutputFormat[Any, Any] {

  override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String = {
    // Partition by the wall-clock time at write. Format the timestamp once
    // and slice day/hour/minute out of the single formatted string (the
    // original formatted it four times, discarding the first result, and
    // called coverTimeStampToString unqualified — it lives on the object).
    val formatted = FileToHdfs.coverTimeStampToString(System.currentTimeMillis())
    val day = formatted.substring(0, 10)     // "yyyy-MM-dd"
    val hour = formatted.substring(11, 13)   // "HH"
    val minute = formatted.substring(14, 16) // "mm"
    s"day=$day/hour=$hour/${name}_$minute"
  }

  /** Hour ("HH") portion of an epoch-millisecond timestamp. */
  def getHour(time: Long): String =
    FileToHdfs.coverTimeStampToString(time).substring(11, 13)

  /** Date ("yyyy-MM-dd") portion of an epoch-millisecond timestamp. */
  def getDay(time: Long): String =
    FileToHdfs.coverTimeStampToString(time).substring(0, 10)

  /** Minute ("mm") portion of an epoch-millisecond timestamp. */
  def getMinute(time: Long): String =
    FileToHdfs.coverTimeStampToString(time).substring(14, 16)
}
// NOTE(review): the two lines below are blog-scrape residue (article title and
// CSDN publish footer), not Scala — commented out so the file compiles.
// sparkstreaming 读取kafka数据,写入hdfs,使用saveAsHadoopFile
// 最新推荐文章于 2022-07-03 20:06:27 发布