Real-Time Data ETL
Problem
Data format:
<<<!>>>3111<<<!>>> ---> record id
<<<!>>>238<<<!>>> ---> userid
<<<!>>>20181111132902<<<!>>> ---> user action time
<<<!>>>58.223.1.112<<<!>>> ---> user IP address
<<<!>>>202.102.92.18<<<!>>> ---> accessed IP address
<<<!>>>59947<<<!>>> ---> user port
<<<!>>>80<<<!>>> ---> service port
<<<!>>>www.sumecjob.com<<<!>>> ---> service domain
<<<!>>><<<!>>>
<<<!>>><<<!>>>
<<<!>>><<<!>>>
<<<!>>><<<!>>>
<<<!>>><<<!>>>
<<<!>>>http://www.sumecjob.com/Social.aspx<<<!>>> ---> exact URL the user visited
<<<!>>>2556928066<<<!>>>
Requirements:
Read records in the format above from a given Kafka topic and clean them: discard the useless fields and keep the useful ones, namely userid, user action time (timestamp), user ip:port, service address:service port, and url. Finally, write the cleaned result back to a designated Kafka topic, completing the online ETL (a minimal sketch of the cleaning step follows the list below).
1. Replace every <<<!>>> with an empty string
2. Rewrite every date as yyyy-MM-dd
3. Track offsets manually in MySQL
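Before diving into the streaming code, here is a minimal, self-contained sketch of the per-record cleaning step. The object and helper names are illustrative, and the assumption that a raw record arrives as a single line is based on the sample above:

```scala
import java.text.SimpleDateFormat

// Illustrative sketch of the cleaning transform; not the streaming job itself.
object CleanSketch {
  def clean(raw: String): String = {
    // Turn each <<<!>>> delimiter into a space and split on whitespace;
    // the empty fields (<<<!>>><<<!>>>) vanish in the split.
    val fields = raw.replace("<<<!>>>", " ").split("\\s+").filter(_.nonEmpty)
    // Reformat the yyyyMMddHHmmss action time as yyyy-MM-dd.
    val date = new SimpleDateFormat("yyyy-MM-dd")
      .format(new SimpleDateFormat("yyyyMMddHHmmss").parse(fields(2)))
    // Keep userid, date, user ip:port, service ip:port, and the url.
    s"${fields(1)} $date ${fields(3)}:${fields(5)} ${fields(4)}:${fields(6)} ${fields(8)}"
  }

  def main(args: Array[String]): Unit = {
    val raw =
      "<<<!>>>3111<<<!>>>" + "<<<!>>>238<<<!>>>" + "<<<!>>>20181111132902<<<!>>>" +
      "<<<!>>>58.223.1.112<<<!>>>" + "<<<!>>>202.102.92.18<<<!>>>" +
      "<<<!>>>59947<<<!>>>" + "<<<!>>>80<<<!>>>" + "<<<!>>>www.sumecjob.com<<<!>>>" +
      "<<<!>>><<<!>>>" * 5 +
      "<<<!>>>http://www.sumecjob.com/Social.aspx<<<!>>>" + "<<<!>>>2556928066<<<!>>>"
    println(clean(raw))
    // prints: 238 2018-11-11 58.223.1.112:59947 202.102.92.18:80 http://www.sumecjob.com/Social.aspx
  }
}
```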
Design of the table that stores the offsets:

Field | Data Type | Notes |
---|---|---|
key | String | the Kafka topic plus the consumer group id (groupId); not a primary key |
partition | Integer | the partition number within the Kafka topic |
offset | Integer | the stored offset |
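For concreteness, a possible MySQL DDL for this table. This is a sketch only: the table and column names match the queries in MySQLConnection further down, while the column types and the unique key are assumptions:

```sql
-- Sketch: table/column names follow the code below; the rest is assumed.
CREATE TABLE `offset` (
  `key`       VARCHAR(255) NOT NULL,  -- topic_groupId, e.g. 'hadoop_hyy'
  `partition` INT          NOT NULL,  -- partition number within the topic
  `offset`    INT          NOT NULL,  -- last saved offset for that partition
  UNIQUE KEY `uk_key_partition` (`key`, `partition`)
);
```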
Implementation:
Main function:
import java.text.SimpleDateFormat
import java.util

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.kafka.clients.producer.KafkaProducer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.{JavaConversions, mutable}

object Demo3_test extends LoggerTrait {
  def main(args: Array[String]): Unit = {
    val streamingContext = new StreamingContext(
      new SparkConf().setMaster("local[*]").setAppName("Demo1_DirectStream"), Seconds(2))
    // Kafka consumer parameters
    val kafkaParams = Map[String, String](
      "bootstrap.servers" -> "hadoopHyy:9092", // Kafka broker address
      "group.id" -> "hyy", // consumer group id
      "auto.offset.reset" -> "smallest" // consume from the earliest message when no offset is stored
    )
    val topics: Set[String] = "hadoop".split(",").toSet // Kafka topic(s) to read from
    val data: InputDStream[(String, String)] = getStreaming(topics, kafkaParams, streamingContext)
    data.foreachRDD((rdd, time) => {
      println(s"----------- ${time} ---------------")
      rdd.foreach(result => {
        // message body
        val messageSource = result._2
        // turn each <<<!>>> delimiter into a space
        val message = messageSource.replace("<<<!>>>", " ")
        // split on whitespace and drop the empty fields
        val strings: Array[String] = message.split("\\s+").filter(_.nonEmpty)
        // reformat the action time as yyyy-MM-dd
        val date = getDate(strings(2))
        // rebuild the record, keeping userid, date, user ip:port, service ip:port and url
        val msg = s"${strings(1)} ${date} ${strings(3)}:${strings(5)} ${strings(4)}:${strings(6)} ${strings(8)}"
        // take a producer from the pool
        val producer: KafkaProducer[String, String] = Demo2_Producer.getKafkaProducer()
        if (producer != null) {
          // send the cleaned record to the flume-kafka topic
          Demo2_Producer.send(producer, "flume-kafka", msg)
          // return the producer to the pool
          Demo2_Producer.recovery(producer)
        }
      })
      val ranges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      for (range <- ranges) {
        // number of messages consumed in this batch
        val count: Long = range.count()
        // untilOffset is exclusive, i.e. already the next offset to consume
        val offset: Long = range.untilOffset
        // topic of this range
        val topic: String = range.topic
        // partition of this range
        val partition: Int = range.partition
        println(s"count = ${count}")
        // if the offset moved, update the offset stored in the database
        if (count > 0)
          MySQLConnection.saveOffset(topic, kafkaParams("group.id"), partition.toString, offset.toInt)
      }
    })
    streamingContext.start()
    streamingContext.awaitTermination()
    // close all pooled producers once the streaming context terminates
    Demo2_Producer.close()
  }
  // build the InputDStream
  def getStreaming(topics: Set[String], kafkaParams: Map[String, String], context: StreamingContext): InputDStream[(String, String)] = {
    // load the stored offsets from the database
    val offsets: Map[TopicAndPartition, Long] = getFromOffset(topics, kafkaParams("group.id"))
    // the InputDStream to return
    var data: InputDStream[(String, String)] = null
    // if the database holds offsets, resume reading from them
    if (offsets.nonEmpty) {
      val messageHandler = (msgHandler: MessageAndMetadata[String, String]) => (msgHandler.key(), msgHandler.message())
      data = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](context, kafkaParams, offsets, messageHandler)
    } else {
      // otherwise fall back to Kafka's own offsets (per auto.offset.reset)
      data = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
        context,
        kafkaParams,
        topics
      )
    }
    data
  }
  // load the stored offset for every partition of every topic
  def getFromOffset(topic: Set[String], groupId: String): Map[TopicAndPartition, Long] = {
    // accumulate the offsets in a mutable map
    val offsets: mutable.Map[TopicAndPartition, Long] = mutable.Map[TopicAndPartition, Long]()
    // iterate over the topics
    topic.foreach(topic => {
      val partitionToInteger: util.Map[TopicAndPartition, Integer] = MySQLConnection.getOffset(topic, groupId)
      val partition2Topic: mutable.Map[TopicAndPartition, Integer] = JavaConversions.mapAsScalaMap(partitionToInteger)
      partition2Topic.foreach(result => {
        offsets.put(result._1, result._2.toLong)
      })
    })
    offsets.toMap
  }
  /**
   * Reformat a yyyyMMddHHmmss timestamp string as yyyy-MM-dd.
   * @param time the raw action time, e.g. "20181111132902"
   * @return the date part, e.g. "2018-11-11"
   */
  def getDate(time: String): String = {
    val d = new SimpleDateFormat("yyyyMMddHHmmss").parse(time)
    new SimpleDateFormat("yyyy-MM-dd").format(d)
  }
}
Message producer wrapper:
import java.util
import java.util.Properties

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

object Demo2_Producer {
  // pool of reusable producer instances
  private val pool: util.LinkedList[KafkaProducer[String, String]] = new util.LinkedList[KafkaProducer[String, String]]()
  private val properties = {
    // load the producer configuration
    val pro = new Properties()
    pro.load(Demo2_Producer.getClass.getClassLoader.getResourceAsStream("producer.properties"))
    pro
  }
  // take a producer from the pool, refilling the pool first if it is empty
  def getKafkaProducer(): KafkaProducer[String, String] = {
    if (pool.size() <= 0) {
      for (_ <- 0 to 10) {
        pool.push(new KafkaProducer[String, String](properties))
      }
    }
    pool.poll()
  }
  // send a message through the given producer
  def send(kafkaProducer: KafkaProducer[String, String], topic: String, message: String): Unit = {
    val record: ProducerRecord[String, String] = new ProducerRecord(topic, message)
    kafkaProducer.send(record)
    println("message send successful...")
  }
  // return a producer to the pool
  def recovery(kafkaProducer: KafkaProducer[String, String]): Unit = {
    pool.push(kafkaProducer)
  }
  // close every producer left in the pool
  def close(): Unit = {
    for (_ <- 0 to 10) {
      val producer = pool.poll()
      if (producer != null)
        producer.close()
    }
  }
}
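The wrapper reads its configuration from a producer.properties file on the classpath. A minimal sketch of that file, assuming the new-style Kafka producer client used above; the broker address mirrors the consumer parameters, and the string serializers are assumptions consistent with KafkaProducer[String, String]:

```properties
# Sketch of producer.properties; adjust bootstrap.servers to your cluster.
bootstrap.servers=hadoopHyy:9092
key.serializer=org.apache.kafka.common.serialization.StringSerializer
value.serializer=org.apache.kafka.common.serialization.StringSerializer
```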
Database connection class that saves the offsets:
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.HashMap;
import java.util.Map;

import kafka.common.TopicAndPartition;
import org.junit.Test;

public class MySQLConnection {
    private static String url = "jdbc:mysql://localhost:3306/vote_web?serverTimezone=UTC&useUnicode=true&useSSL=false";
    private static String username = "root";
    private static String password = "123456";
    public static Connection getConnection() {
        Connection connection = null;
        try {
            Class.forName("com.mysql.jdbc.Driver");
            connection = DriverManager.getConnection(url, username, password);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return connection;
    }
    // look up the stored offsets for a topic and consumer group
    public static Map<TopicAndPartition, Integer> getOffset(String topic, String groupId) {
        Map<TopicAndPartition, Integer> result = new HashMap<TopicAndPartition, Integer>();
        // the key column combines the topic and the consumer group id
        String key = topic + "_" + groupId;
        Connection connection = getConnection();
        try {
            Statement statement = connection.createStatement();
            String sql = "select `offset`,`partition` from `offset` where `key` = '" + key + "'";
            ResultSet resultSet = statement.executeQuery(sql);
            while (resultSet.next()) {
                Integer offset = resultSet.getInt(1);
                String partition = resultSet.getString(2);
                result.put(TopicAndPartition.apply(topic, Integer.parseInt(partition)), offset);
            }
            connection.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return result;
    }
    // save an offset, inserting the row if it does not exist yet
    public static void saveOffset(String topic, String groupId, String partition, int offset) {
        String key = topic + "_" + groupId;
        Connection connection = getConnection();
        try {
            Statement statement = connection.createStatement();
            String update = "update `offset` set `offset` = " + offset
                    + " where `key` = '" + key + "' and `partition` = " + partition;
            // an update alone silently does nothing on the first run, so insert when no row matched
            if (statement.executeUpdate(update) == 0) {
                String insert = "insert into `offset` (`key`, `partition`, `offset`) values ('"
                        + key + "', " + partition + ", " + offset + ")";
                statement.execute(insert);
            }
            connection.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
    @Test
    public void test() {
        // Map<TopicAndPartition, Integer> offset = MySQLConnection.getOffset("hadoop", "hyy");
        // for (Map.Entry<TopicAndPartition, Integer> e : offset.entrySet()) {
        //     System.out.println(e.getKey().topic() + "==" + e.getValue());
        // }
        MySQLConnection.saveOffset("hadoop", "hyy", "0", 1);
    }
}
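For reference, the direct-stream API used here (TopicAndPartition, MessageAndMetadata, StringDecoder) is the old spark-streaming-kafka-0-8 integration. A sketch of the sbt dependencies this code assumes; the version numbers are assumptions, so pick ones matching your cluster:

```scala
// build.sbt sketch; the exact versions are assumptions.
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-streaming"           % "2.2.0",
  "org.apache.spark" %% "spark-streaming-kafka-0-8" % "2.2.0",
  "mysql"            %  "mysql-connector-java"      % "5.1.47",
  "junit"            %  "junit"                     % "4.12" % Test
)
```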
Test result:
Test passed, nice~