Real-Time Data ETL
Problem
Data format:
<<<!>>>3111<<<!>>> ---> record id
<<<!>>>238<<<!>>> ---> userid
<<<!>>>20181111132902<<<!>>> ---> user action time
<<<!>>>58.223.1.112<<<!>>> ---> user IP address
<<<!>>>202.102.92.18<<<!>>> ---> accessed IP address
<<<!>>>59947<<<!>>> ---> user port
<<<!>>>80<<<!>>> ---> service port
<<<!>>>www.sumecjob.com<<<!>>> ---> service domain
<<<!>>><<<!>>>
<<<!>>><<<!>>>
<<<!>>><<<!>>>
<<<!>>><<<!>>>
<<<!>>><<<!>>>
<<<!>>>http://www.sumecjob.com/Social.aspx<<<!>>> ---> exact URL the user visited
<<<!>>>2556928066<<<!>>>
Requirements:
Read records in the format above from a given Kafka topic and clean them: discard the useless fields and keep the useful ones, namely userid, user action time (timestamp), user ip:port, service address:service port, and url. Finally, write the cleaned result back to a designated Kafka topic, completing the online ETL (a minimal sketch of the cleaning step follows the list below).
1. Replace every <<<!>>> with an empty string
2. Rewrite every date as yyyy-MM-dd
3. Track offsets manually in MySQL
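Before diving into the streaming code, here is a minimal, self-contained sketch of the per-record cleaning step. The object and helper names are illustrative, and the assumption that a raw record arrives as a single line is based on the sample above:

```scala
import java.text.SimpleDateFormat

// Illustrative sketch of the cleaning transform; not the streaming job itself.
object CleanSketch {
  def clean(raw: String): String = {
    // Turn each <<<!>>> delimiter into a space and split on whitespace;
    // the empty fields (<<<!>>><<<!>>>) vanish in the split.
    val fields = raw.replace("<<<!>>>", " ").split("\\s+").filter(_.nonEmpty)
    // Reformat the yyyyMMddHHmmss action time as yyyy-MM-dd.
    val date = new SimpleDateFormat("yyyy-MM-dd")
      .format(new SimpleDateFormat("yyyyMMddHHmmss").parse(fields(2)))
    // Keep userid, date, user ip:port, service ip:port, and the url.
    s"${fields(1)} $date ${fields(3)}:${fields(5)} ${fields(4)}:${fields(6)} ${fields(8)}"
  }

  def main(args: Array[String]): Unit = {
    val raw =
      "<<<!>>>3111<<<!>>>" + "<<<!>>>238<<<!>>>" + "<<<!>>>20181111132902<<<!>>>" +
      "<<<!>>>58.223.1.112<<<!>>>" + "<<<!>>>202.102.92.18<<<!>>>" +
      "<<<!>>>59947<<<!>>>" + "<<<!>>>80<<<!>>>" + "<<<!>>>www.sumecjob.com<<<!>>>" +
      "<<<!>>><<<!>>>" * 5 +
      "<<<!>>>http://www.sumecjob.com/Social.aspx<<<!>>>" + "<<<!>>>2556928066<<<!>>>"
    println(clean(raw))
    // prints: 238 2018-11-11 58.223.1.112:59947 202.102.92.18:80 http://www.sumecjob.com/Social.aspx
  }
}
```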
Design of the table that stores the offsets:

Field | Data Type | Notes |
---|---|---|
key | String | the Kafka topic plus the consumer group id (groupId); not a primary key |
partition | Integer | the partition number within the Kafka topic |
offset | Integer | the stored offset |
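For concreteness, a possible MySQL DDL for this table. This is a sketch only: the table and column names match the queries in MySQLConnection further down, while the column types and the unique key are assumptions:

```sql
-- Sketch: table/column names follow the code below; the rest is assumed.
CREATE TABLE `offset` (
  `key`       VARCHAR(255) NOT NULL,  -- topic_groupId, e.g. 'hadoop_hyy'
  `partition` INT          NOT NULL,  -- partition number within the topic
  `offset`    INT          NOT NULL,  -- last saved offset for that partition
  UNIQUE KEY `uk_key_partition` (`key`, `partition`)
);
```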
Implementation:
Main function:
import java.text.SimpleDateFormat
import java.util

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.kafka.clients.producer.KafkaProducer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.{JavaConversions, mutable}

object Demo3_test extends LoggerTrait {
  def main(args: Array[String]): Unit = {
    val streamingContext = new StreamingContext(
      new SparkConf().setMaster("local[*]").setAppName("Demo1_DirectStream"), Seconds(2))
    // Kafka consumer parameters
    val kafkaParams = Map[String, String](
      "bootstrap.servers" -> "hadoopHyy:9092", // Kafka broker address
      "group.id" -> "hyy", // consumer group id
      "auto.offset.reset" -> "smallest" // consume from the earliest message when no offset is stored
    )
    val topics: Set[String] = "hadoop".split(",").toSet // Kafka topic(s) to read from
    val data: InputDStream[(String, String)] = getStreaming(topics, kafkaParams, streamingContext)
    data.foreachRDD((rdd, time) => {
      println(s"----------- ${time} ---------------")
      rdd.foreach(result => {
        // message body
        val messageSource = result._2
        // turn each <<<!>>> delimiter into a space
        val message = messageSource.replace("<<<!>>>", " ")
        // split on whitespace and drop the empty fields
        val strings: Array[String] = message.split("\\s+").filter(_.nonEmpty)
        // reformat the action time as yyyy-MM-dd
        val date = getDate(strings(2))
        // rebuild the record, keeping userid, date, user ip:port, service ip:port and url
        val msg = s"${strings(1)} ${date} ${strings(3)}:${strings(5)} ${strings(4)}:${strings(6)} ${strings(8)}"
        // take a producer from the pool
        val producer: KafkaProducer[String, String] = Demo2_Producer.getKafkaProducer()
        if (producer != null) {
          // send the cleaned record to the flume-kafka topic
          Demo2_Producer.send(producer, "flume-kafka", msg)
          // return the producer to the pool
          Demo2_Producer.recovery(producer)
        }
      })
      val ranges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      for (range <- ranges) {
        // number of messages consumed in this batch
        val count: Long = range.count()
        // untilOffset is exclusive, i.e. already the next offset to consume
        val offset: Long = range.untilOffset
        // topic of this range
        val topic: String = range.topic
        // partition of this range
        val partition: Int = range.partition
        println(s"count = ${count}")
        // if the offset moved, update the offset stored in the database
        if (count > 0)
          MySQLConnection.saveOffset(topic, kafkaParams("group.id"), partition.toString, offset.toInt)
      }
    })
    streamingContext.start()
    streamingContext.awaitTermination()
    // close all pooled producers once the streaming context terminates
    Demo2_Producer.close()
  }
  // build the InputDStream
  def getStreaming(topics: Set[String], kafkaParams: Map[String, String], context: StreamingContext): InputDStream[(String, String)] = {
    // load the stored offsets from the database
    val offsets: Map[TopicAndPartition, Long] = getFromOffset(topics, kafkaParams("group.id"))
    // the InputDStream to return
    var data: InputDStream[(String, String)] = null
    // if the database holds offsets, resume reading from them
    if (offsets.nonEmpty) {
      val messageHandler = (msgHandler: MessageAndMetadata[String, String]) => (msgHandler.key(), msgHandler.message())
      data = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](context, kafkaParams, offsets, messageHandler)
    } else {
      // otherwise fall back to Kafka's own offsets (per auto.offset.reset)
      data = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
        context,
        kafkaParams,
        topics
      )
    }
    data
  }
  // load the stored offset for every partition of every topic
  def getFromOffset(topic: Set[String], groupId: String): Map[TopicAndPartition, Long] = {
    // accumulate the offsets in a mutable map
    val offsets: mutable.Map[TopicAndPartition, Long] = mutable.Map[TopicAndPartition, Long]()
    // iterate over the topics
    topic.foreach(topic => {
      val partitionToInteger: util.Map[TopicAndPartition, Integer] = MySQLConnection.getOffset(topic, groupId)
      val partition2Topic: mutable.Map[TopicAndPartition, Integer] = JavaConversions.mapAsScalaMap(partitionToInteger)
      partition2Topic.foreach(result => {
        offsets.put(result._1, result._2.toLong)
      })
    })
    offsets.toMap
  }
  /**
   * Reformat a yyyyMMddHHmmss timestamp string as yyyy-MM-dd.
   * @param time the raw action time, e.g. "20181111132902"
   * @return the date part, e.g. "2018-11-11"
   */
  def getDate(time: String): String = {
    val d = new SimpleDateFormat("yyyyMMddHHmmss").parse(time)
    new SimpleDateFormat("yyyy-MM-dd").format(d)
  }
}
Message producer wrapper:
import java.util
import java.util.Properties

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

object Demo2_Producer {
  // pool of reusable producer instances
  private val pool: util.LinkedList[KafkaProducer[String, String]] = new util.LinkedList[KafkaProducer[String, String]]()
  private val properties = {
    // load the producer configuration
    val pro = new Properties()
    pro.load(Demo2_Producer.getClass.getClassLoader.getResourceAsStream("producer.properties"))
    pro
  }
  // take a producer from the pool, refilling the pool first if it is empty
  def getKafkaProducer(): KafkaProducer[String, String] = {
    if (pool.size() <= 0) {
      for (_ <- 0 to 10) {
        pool.push(new KafkaProducer[String, String](properties))
      }
    }
    pool.poll()
  }
  // send a message through the given producer
  def send(kafkaProducer: KafkaProducer[String, String], topic: String, message: String): Unit = {
    val record: ProducerRecord[String, String] = new ProducerRecord(topic, message)
    kafkaProducer.send(record)
    println("message send successful...")
  }
  // return a producer to the pool
  def recovery(kafkaProducer: KafkaProducer[String, String]): Unit = {
    pool.push(kafkaProducer)
  }
  // close every producer left in the pool
  def close(): Unit = {
    for (_ <- 0 to 10) {
      val producer = pool.poll()
      if (producer != null)
        producer.close()
    }
  }
}
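The wrapper reads its configuration from a producer.properties file on the classpath. A minimal sketch of that file, assuming the new-style Kafka producer client used above; the broker address mirrors the consumer parameters, and the string serializers are assumptions consistent with KafkaProducer[String, String]:

```properties
# Sketch of producer.properties; adjust bootstrap.servers to your cluster.
bootstrap.servers=hadoopHyy:9092
key.serializer=org.apache.kafka.common.serialization.StringSerializer
value.serializer=org.apache.kafka.common.serialization.StringSerializer
```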
Database connection class that saves the offsets:
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.HashMap;
import java.util.Map;

import kafka.common.TopicAndPartition;
import org.junit.Test;

public class MySQLConnection {
    private static String url = "jdbc:mysql://localhost:3306/vote_web?serverTimezone=UTC&useUnicode=true&useSSL=false";
    private static String username = "root";
    private static String password = "123456";
    public static Connection getConnection() {
        Connection connection = null;
        try {
            Class.forName("com.mysql.jdbc.Driver");
            connection = DriverManager.getConnection(url, username, password);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return connection;
    }
    // look up the stored offsets for a topic and consumer group
    public static Map<TopicAndPartition, Integer> getOffset(String topic, String groupId) {
        Map<TopicAndPartition, Integer> result = new HashMap<TopicAndPartition, Integer>();
        // the key column combines the topic and the consumer group id
        String key = topic + "_" + groupId;
        Connection connection = getConnection();
        try {
            Statement statement = connection.createStatement();
            String sql = "select `offset`,`partition` from `offset` where `key` = '" + key + "'";
            ResultSet resultSet = statement.executeQuery(sql);
            while (resultSet.next()) {
                Integer offset = resultSet.getInt(1);
                String partition = resultSet.getString(2);
                result.put(TopicAndPartition.apply(topic, Integer.parseInt(partition)), offset);
            }
            connection.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return result;
    }
    // save an offset, inserting the row if it does not exist yet
    public static void saveOffset(String topic, String groupId, String partition, int offset) {
        String key = topic + "_" + groupId;
        Connection connection = getConnection();
        try {
            Statement statement = connection.createStatement();
            String update = "update `offset` set `offset` = " + offset
                    + " where `key` = '" + key + "' and `partition` = " + partition;
            // an update alone silently does nothing on the first run, so insert when no row matched
            if (statement.executeUpdate(update) == 0) {
                String insert = "insert into `offset` (`key`, `partition`, `offset`) values ('"
                        + key + "', " + partition + ", " + offset + ")";
                statement.execute(insert);
            }
            connection.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
    @Test
    public void test() {
        // Map<TopicAndPartition, Integer> offset = MySQLConnection.getOffset("hadoop", "hyy");
        // for (Map.Entry<TopicAndPartition, Integer> e : offset.entrySet()) {
        //     System.out.println(e.getKey().topic() + "==" + e.getValue());
        // }
        MySQLConnection.saveOffset("hadoop", "hyy", "0", 1);
    }
}
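For reference, the direct-stream API used here (TopicAndPartition, MessageAndMetadata, StringDecoder) is the old spark-streaming-kafka-0-8 integration. A sketch of the sbt dependencies this code assumes; the version numbers are assumptions, so pick ones matching your cluster:

```scala
// build.sbt sketch; the exact versions are assumptions.
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-streaming"           % "2.2.0",
  "org.apache.spark" %% "spark-streaming-kafka-0-8" % "2.2.0",
  "mysql"            %  "mysql-connector-java"      % "5.1.47",
  "junit"            %  "junit"                     % "4.12" % Test
)
```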
Test result:
Test passed, nice~