Druid in practice: working with Flink to receive and process Kafka data
1. Key Druid concept: roll-up aggregation
Druid can summarize (roll up) raw data at ingestion time. Roll-up aggregates rows that share the same dimension values, which reduces the amount of data that has to be stored. Druid expects each row to consist of three parts: a timestamp, dimension columns, and metric columns. Consider the following data:
{"timestamp":"2018-01-01T01:01:35Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":20,"bytes":9024}
{"timestamp":"2018-01-01T01:01:51Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":255,"bytes":21133}
{"timestamp":"2018-01-01T01:01:59Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":11,"bytes":5780}
{"timestamp":"2018-01-01T01:02:14Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":38,"bytes":6289}
{"timestamp":"2018-01-01T01:02:29Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":377,"bytes":359971}
{"timestamp":"2018-01-01T01:03:29Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":49,"bytes":10204}
{"timestamp":"2018-01-02T21:33:14Z","srcIP":"7.7.7.7", "dstIP":"8.8.8.8","packets":38,"bytes":6289}
{"timestamp":"2018-01-02T21:33:45Z","srcIP":"7.7.7.7", "dstIP":"8.8.8.8","packets":123,"bytes":93999}
{"timestamp":"2018-01-02T21:35:45Z","srcIP":"7.7.7.7", "dstIP":"8.8.8.8","packets":12,"bytes":2818}
If Druid loads this data with roll-up enabled, configured to sum the packets and bytes columns and to count the input rows (the exact settings are covered later when we load data), the rolled-up result looks like the table below. For example, the three rows that fall into the minute 01:01 collapse into one row with packets = 20 + 255 + 11 = 286, bytes = 9024 + 21133 + 5780 = 35937 and count = 3:
┌──────────────────────────┬────────┬───────┬─────────┬─────────┬─────────┐
│ __time │ bytes │ count │ dstIP │ packets │ srcIP │
├──────────────────────────┼────────┼───────┼─────────┼─────────┼─────────┤
│ 2018-01-01T01:01:00.000Z │ 35937 │ 3 │ 2.2.2.2 │ 286 │ 1.1.1.1 │
│ 2018-01-01T01:02:00.000Z │ 366260 │ 2 │ 2.2.2.2 │ 415 │ 1.1.1.1 │
│ 2018-01-01T01:03:00.000Z │ 10204 │ 1 │ 2.2.2.2 │ 49 │ 1.1.1.1 │
│ 2018-01-02T21:33:00.000Z │ 100288 │ 2 │ 8.8.8.8 │ 161 │ 7.7.7.7 │
│ 2018-01-02T21:35:00.000Z │ 2818 │ 1 │ 8.8.8.8 │ 12 │ 7.7.7.7 │
└──────────────────────────┴────────┴───────┴─────────┴─────────┴─────────┘
Because Druid aggregates data as it ingests it, and because it can also consume real-time streams, we can point Druid at Kafka and let it do the aggregation.
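To make the roll-up mechanics concrete, here is a small illustrative sketch in plain Scala (this is not Druid code, just ordinary collections operating on the three sample rows from the 01:01 minute): it groups rows on the minute-truncated timestamp plus the dimension columns and sums the metric columns, reproducing the first rolled-up row above.

// plain-Scala illustration of minute-level roll-up (not how Druid is implemented)
case class Row(timestamp: String, srcIP: String, dstIP: String, packets: Long, bytes: Long)

val rows = Seq(
  Row("2018-01-01T01:01:35Z", "1.1.1.1", "2.2.2.2", 20, 9024),
  Row("2018-01-01T01:01:51Z", "1.1.1.1", "2.2.2.2", 255, 21133),
  Row("2018-01-01T01:01:59Z", "1.1.1.1", "2.2.2.2", 11, 5780)
)

// truncate the timestamp to the minute, then group by (minute, srcIP, dstIP)
val rolledUp = rows
  .groupBy(r => (r.timestamp.substring(0, 16) + ":00Z", r.srcIP, r.dstIP))
  .map { case ((minute, src, dst), group) =>
    (minute, src, dst, group.map(_.packets).sum, group.map(_.bytes).sum, group.size)
  }

// prints (2018-01-01T01:01:00Z,1.1.1.1,2.2.2.2,286,35937,3)
rolledUp.foreach(println)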
Architecture diagram
Once the data lands in Kafka, Flink only pre-processes it and writes it back to Kafka, which greatly reduces the amount of computation Flink has to do. Druid then pulls the pre-processed data, aggregates it, stores it, and serves queries.
2. Create a Kafka producer to generate ad click data
1. Create the model class
public class AdClickLog {
// ad ID
private long t_id;
// advertiser ID
private long corpuin;
// domain
private String host;
// device type
private String device_type;
// ad source
private String ad_source;
// ad media
private String ad_media;
// ad campaign
private String ad_compaign;
// city
private String city;
// click time
private String timestamp;
// user ID
private String user_id;
// filled with user_id when a click occurs
private String click_user_id;
// getters and setters omitted
}
2. Create code that simulates generating click logs; note that you must adjust the request URL at the end.
import com.alibaba.fastjson.JSONObject;
import com.itheima.report.bean.AdClickLog;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Random;
/**
* Ad click log simulator
*/
public class AdClickLogGenerator {
private static Long[] t_ids = new Long[]{1l, 2l, 3l, 4l, 5l, 6l, 7l, 8l, 9l, 10l, 11l, 12l, 13l, 14l, 15l, 16l, 17l, 18l, 19l, 20l};//ad id pool
private static Long[] corpuins = new Long[]{1l, 2l, 3l, 4l, 5l, 6l, 7l, 8l, 9l, 10l, 11l, 12l, 13l, 14l, 15l, 16l, 17l, 18l, 19l, 20l};//advertiser id pool
private static Long[] user_ids = new Long[]{1l, 2l, 3l, 4l, 5l, 6l, 7l, 8l, 9l, 10l, 11l, 12l, 13l, 14l, 15l, 16l, 17l, 18l, 19l, 20l};//user id pool
/*
host domains
*/
private static String[] hosts = new String[]{"baidu.com", "google"};//domain pool
/**
* whether the user is new
*/
private static int[] is_news = new int[]{0, 1};
/*
ad sources
*/
private static String[] ad_sources = new String[]{"s1", "s2"};
/*
ad media
*/
private static String[] ad_medias = new String[]{"m1", "m2"};
/*
ad campaigns
*/
private static String[] ad_campagins = new String[]{"风系列", "人生","爱情"};
/**
* device types
*/
private static String[] device_types = new String[]{"pc", "mobile", "other"};
/*
cities
*/
private static String[] citys = new String[]{"beijing", "shanghai", "guangzhou"};
private static Long[] gettimes(String time) {
DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss:SSS");
try {
Date date = dateFormat.parse(time);
long timetemp = date.getTime();
Random random = new Random();
int randomint = random.nextInt(10);
long starttime = timetemp - randomint * 3600 * 1000;
long endtime = starttime + randomint * 3600 * 1000;
return new Long[]{starttime, endtime};
} catch (ParseException e) {
e.printStackTrace();
}
return new Long[]{0l, 0l};
}
/**
* Simulate sending an HTTP request to the reporting service
*
* @param url
* @param json
*/
public static void send(String url, String json) {
try {
CloseableHttpClient httpClient = HttpClientBuilder.create().build();
HttpPost post = new HttpPost(url);
try {
StringEntity s = new StringEntity(json.toString(), "utf-8");
s.setContentEncoding("utf-8");
// sending JSON requires setting the contentType
s.setContentType("application/json");
post.setEntity(s);
HttpResponse res = httpClient.execute(post);
if (res.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
// the response is returned as JSON
String result = EntityUtils.toString(res.getEntity());
System.out.println(result);
}
} catch (Exception e) {
throw new RuntimeException(e);
}
} catch (Exception e) {
e.printStackTrace();
}
}
public static void main(String[] args) {
SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
Random random = new Random();
for (int i = 0; i < 10; i++) {
//randomly fill in city, campaign, media, source, advertiser, device type, host, ad id, user id and click time
AdClickLog clickLog = new AdClickLog();
clickLog.setCity(citys[random.nextInt(citys.length)]);
clickLog.setAd_compaign(ad_campagins[random.nextInt(ad_campagins.length)]);
clickLog.setAd_media(ad_medias[random.nextInt(ad_medias.length)]);
clickLog.setAd_source(ad_sources[random.nextInt(ad_sources.length)]);
clickLog.setCorpuin(corpuins[random.nextInt(corpuins.length)]);
clickLog.setDevice_type(device_types[random.nextInt(device_types.length)]);
clickLog.setHost(hosts[random.nextInt(hosts.length)]);
clickLog.setT_id(t_ids[random.nextInt(t_ids.length)]);
Date date = new Date();
clickLog.setTimestamp(df.format(date));
clickLog.setUser_id(user_ids[random.nextInt(user_ids.length)].toString());
//simulate a click on every other record by filling in click_user_id
if(i%2==0){
clickLog.setClick_user_id(clickLog.getUser_id());
}
String jonstr = JSONObject.toJSONString(clickLog);
System.out.println(jonstr);
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
e.printStackTrace();
}
send("http://localhost:8888/adReceive", jonstr);
}
}
}
3. Create a controller that receives the request and sends the log data to Kafka. Note that you need to change the Kafka topic to your own and create it with Kafka Manager. The Message bean used below is a simple wrapper with message, count and timestamp fields.
import com.alibaba.fastjson.JSON;
import com.itheima.report.bean.Message;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.kafka.core.KafkaTemplate;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import java.util.HashMap;
import java.util.Map;
@RestController
@RequestMapping("/")
public class AdReportController {
@Value("adtest")
private String topic;
@Autowired
private KafkaTemplate kafkaTemplate;
@RequestMapping("/adReceive")
public Map receive(@RequestBody String json) {
// result returned to the caller
Map<String, String> result = new HashMap<>();
try {
// wrap the payload in a Message entity
Message message = new Message();
// message body (the click log JSON)
message.setMessage(json);
// number of clicks
message.setCount(1L);
// event time
message.setTimestamp(System.currentTimeMillis());
// convert the entity to a JSON string
String messageJSON = JSON.toJSONString(message);
System.out.println(messageJSON);
// send the message to the configured Kafka topic
kafkaTemplate.send(topic,messageJSON);
result.put("result", "ok");
} catch (Exception e) {
e.printStackTrace();
result.put("result", "failed");
}
return result;
}
}
3. Flink data pre-processing
1. Create a case class to wrap the data
import com.alibaba.fastjson.JSON
import org.apache.commons.lang3.StringUtils
case class AdClickLog(
city:String,
ad_compaign: String,
ad_media: String,
ad_source: String,
corpuin: String,
device_type: String,
host: String,
t_id: String,
user_id: String,
click_user_id: String,
timestamp: String
)
object AdClickLog {
def apply(json: String) = {
//use FastJSON's JSON.parseObject to build an AdClickLog instance from the JSON string, e.g.
//{\"ad_compaign\":\"风系列\",\"ad_media\":\"m1\",\"ad_source\":\"s2\",\"corpuin\":18,
// \"device_type\":\"mobile\",\"host\":\"google\",\"is_new\":1,\"t_id\":5,\"timestamp\":1559092939197}
val jsonObject = JSON.parseObject(json)
var click_user_id = jsonObject.getString("click_user_id")
if (StringUtils.isBlank(click_user_id)) {
click_user_id = "null"
}
new AdClickLog(
jsonObject.getString("city"),
jsonObject.getString("ad_compaign"),
jsonObject.getString("ad_media"),
jsonObject.getString("ad_source"),
jsonObject.getString("corpuin"),
jsonObject.getString("device_type"),
jsonObject.getString("host"),
jsonObject.getString("t_id"),
jsonObject.getString("user_id"),
click_user_id,
jsonObject.getString("timestamp")
)
}
}
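A quick sanity check of the custom apply method; the sample message below is made up purely to mirror what the producer sends and is not part of the project:

// hypothetical sample message; click_user_id is absent, so apply normalises it to the string "null"
val sample = """{"city":"beijing","ad_compaign":"风系列","ad_media":"m1","ad_source":"s2","corpuin":"18","device_type":"mobile","host":"google","t_id":"5","user_id":"7","timestamp":"2019-06-01T10:00:00.000+0800"}"""
val log = AdClickLog(sample)
println(log.city)          // beijing
println(log.click_user_id) // "null" (normalised because click_user_id was missing)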
Next, update the configuration: add the topic that Flink consumes from and the topic the pre-processed messages are produced to, and expose both through GlobalConfigUtil.
First add the topic entries to the configuration file, then add getter methods for them in GlobalConfigUtil, as sketched below.
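A minimal sketch of what those additions might look like, assuming the project's GlobalConfigUtil is backed by Typesafe Config; the key names below are chosen for illustration and must match whatever your configuration file actually uses:

// application.conf additions (key names are illustrative):
//   kafka.topic.ad        = "adtest"
//   kafka.topic.adProcess = "ad_click_process"

import com.typesafe.config.{Config, ConfigFactory}

object GlobalConfigUtil {
  val config: Config = ConfigFactory.load()

  // existing entries (bootstrapServers, zookeeperConnect, groupId, ...) omitted

  // topic the Flink job consumes the raw ad click messages from
  def adTopic: String = config.getString("kafka.topic.ad")

  // topic the Flink job writes the pre-processed (widened) records to
  def adProcessTopic: String = config.getString("kafka.topic.adProcess")
}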
The Flink job:
import java.text.SimpleDateFormat
import java.util.Properties
import com.alibaba.fastjson.JSON
import com.itheima.realprocess.bean.AdClickLog
import com.itheima.realprocess.task._
import com.itheima.realprocess.util.GlobalConfigUtil
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.api.scala._
import org.apache.flink.runtime.state.filesystem.FsStateBackend
import org.apache.flink.streaming.api.environment.CheckpointConfig
import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.streaming.api.watermark.Watermark
import org.apache.flink.streaming.api.{CheckpointingMode, TimeCharacteristic}
import org.apache.flink.streaming.connectors.kafka.{FlinkKafkaConsumer09, FlinkKafkaProducer09}
import org.json4s.DefaultFormats
import org.json4s.native.Serialization.write
object AdApp {
def main(args: Array[String]): Unit = {
// obtain the StreamExecutionEnvironment
val env = StreamExecutionEnvironment.getExecutionEnvironment
// use event time, i.e. process records based on the time they occurred
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
// set the default development parallelism to 1
env.setParallelism(1)
// enable checkpointing so the job can run safely for a long time
//
// trigger a checkpoint every 5 seconds
env.enableCheckpointing(5000)
// use exactly-once checkpointing semantics
env.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE)
// minimum pause between two checkpoints
env.getCheckpointConfig.setMinPauseBetweenCheckpoints(1000)
// checkpoint timeout
env.getCheckpointConfig.setCheckpointTimeout(60000)
// maximum number of concurrent checkpoints
env.getCheckpointConfig.setMaxConcurrentCheckpoints(1)
// retain externalized checkpoints when the job is cancelled
env.getCheckpointConfig.enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)
// where checkpoints are stored
env.setStateBackend(new FsStateBackend("hdfs://hp101:9000/flink-checkpoints/"))
//
// integrate with Kafka
//
val properties = new Properties()
properties.setProperty("bootstrap.servers", GlobalConfigUtil.bootstrapServers)
properties.setProperty("zookeeper.connect", GlobalConfigUtil.zookeeperConnect)
properties.setProperty("group.id", GlobalConfigUtil.groupId)
properties.setProperty("enable.auto.commit", GlobalConfigUtil.enableAutoCommit)
properties.setProperty("auto.commit.interval.ms", GlobalConfigUtil.autoCommitIntervalMs)
// where to start consuming when no valid offset is available
// latest: start from the newest records
// earliest: start from the beginning (records may be re-consumed)
properties.setProperty("auto.offset.reset", GlobalConfigUtil.autoOffsetReset)
// key serializer / deserializer settings
properties.setProperty("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
properties.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
val consumer: FlinkKafkaConsumer09[String] = new FlinkKafkaConsumer09[String](
GlobalConfigUtil.adTopic,
new SimpleStringSchema(),
properties
)
val kafkaDataStream: DataStream[String] = env.addSource(consumer)
// use a map operator to parse each message consumed from Kafka
val messageDataStream = kafkaDataStream.map {
msgJson =>
// parse the string into a JSON object with FastJSON
val jsonObject = JSON.parseObject(msgJson)
val count = jsonObject.getLong("count")
val message = jsonObject.getString("message")
val timestamp = jsonObject.getLong("timestamp")
print(message)
// unwrap the inner click log JSON and build an AdClickLog case class instance
AdClickLog(message)
}
messageDataStream.print()
// assign timestamps and watermarks; the maximum allowed lateness is 2 seconds
val watermarkDataStream: DataStream[AdClickLog] = messageDataStream.assignTimestampsAndWatermarks(new AssignerWithPeriodicWatermarks[AdClickLog] {
var currentTimestamp: Long = 0L
val maxDelayTime = 2000L
val df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ")
var watermark: Watermark = _
// return the current watermark
override def getCurrentWatermark = {
watermark = new Watermark(currentTimestamp - maxDelayTime)
watermark
}
// extract the event timestamp from the record
override def extractTimestamp(t: AdClickLog, l: Long) = {
val timeStamp = t.timestamp
currentTimestamp = Math.max(df.parse(timeStamp).getTime, currentTimestamp)
currentTimestamp
}
})
// call the pre-processing task's process method and print the result for testing
val clicklogWideDataStream = PreprocessTask.processAd(watermarkDataStream)
val value: DataStream[String] = clicklogWideDataStream.map {
x =>
implicit val formats = DefaultFormats
val jsonString = write(x)
jsonString
}
value.addSink(
new FlinkKafkaProducer09[String](
GlobalConfigUtil.bootstrapServers,
GlobalConfigUtil.adProcessTopic,
new SimpleStringSchema()
)
)
// clicklogWideDataStream.addSink()
env.execute("RealProcessApp")
}
}
Add a process method that pre-processes the data: it mainly decides whether the user is a new user and adds a click-count field. The code assumes an AdClickLogWide case class that widens AdClickLog with these two fields; a sketch of it follows the code.
import com.itheima.realprocess.bean.{AdClickLog, AdClickLogWide}
import com.itheima.realprocess.util.HBaseUtil
import org.apache.commons.lang3.StringUtils
import org.apache.flink.api.scala._
import org.apache.flink.streaming.api.scala.DataStream
/**
* Pre-processing task
*/
object PreprocessTask {
// case class wrapping the isNew flags
case class IsNewWrapper(isNew: Int, isHourNew: Int, isDayNew: Int, isMonthNew: Int)
// process the ad click data: add the isNew flag and a click count
def processAd(watermarkDataStream: DataStream[AdClickLog]) = {
watermarkDataStream.map {
msg =>
val isNew = analysisNew(msg)
var click_cnt = 0
if (!msg.click_user_id.equalsIgnoreCase("null")) {
click_cnt = 1
}
AdClickLogWide(
msg.city,
msg.ad_compaign,
msg.ad_media,
msg.ad_source,
msg.corpuin,
msg.device_type,
msg.host,
msg.t_id,
msg.user_id,
msg.click_user_id,
msg.timestamp,
isNew,
click_cnt
)
}
}
/*
Decide whether the user is a new user
*/
def analysisNew(adlog: AdClickLog) = {
// the isNew flag defaults to 0 (existing user)
var isNew = 0
// fields needed to read from and write to HBase
val tableName = "user_history"
val rowkey = adlog.user_id
val cfName = "info"
val userIdColName = "userId" // user ID column
// look up the userId column of the user_history table for rowkey = user_id
val userIdInHBase = HBaseUtil.getData(tableName, rowkey, cfName, userIdColName)
// check whether the userId column is empty
if (StringUtils.isBlank(userIdInHBase)) {
// empty means the user has not been seen before:
// set isNew to 1 to mark a new user
isNew = 1
// and record the user in the user_history table
HBaseUtil.putMapData(tableName, rowkey, cfName, Map(
userIdColName -> adlog.user_id
))
}
isNew
}
}
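The task above relies on two things not shown here: the project's HBaseUtil helper (getData returns the stored cell value as a String, putMapData writes a Map of columns), and the AdClickLogWide case class. Here is a minimal sketch of the latter; the field names and order are inferred from the positional constructor call above and from the field names the Druid ingestion spec below expects, so adjust them if your project differs:

// Sketch of the widened record that is serialised with json4s and written back to Kafka.
// The case class field names become the JSON keys, so timestamp, is_new, click_cnt,
// user_id and click_user_id line up with the Druid ingestion spec in the next section.
case class AdClickLogWide(
  city: String,
  ad_compaign: String,
  ad_media: String,
  ad_source: String,
  corpuin: String,
  device_type: String,
  host: String,
  t_id: String,
  user_id: String,
  click_user_id: String,
  timestamp: String,
  is_new: Int,
  click_cnt: Int
)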
4. Druid real-time data ingestion
1. Write the Kafka indexing (ingestion spec) file
{
"type": "kafka",
"dataSchema": {
"dataSource": "adclicklog",
"parser": {
"type": "string",
"parseSpec": {
"format": "json",
"timestampSpec": {
"column": "timestamp",
"format": "auto"
},
"dimensionsSpec": {
"dimensions": [ ],
"dimensionExclusions": [
"timestamp",
"is_new",
"click_cnt"
]
}
}
},
"metricsSpec": [
{
"name": "count",
"type": "count"
},
{
"name": "click_cnt",
"fieldName": "click_cnt",
"type": "longSum"
},
{
"name": "new_cnt",
"fieldName": "is_new",
"type": "longSum"
},
{
"name": "uv",
"fieldName": "user_id",
"type": "thetaSketch",
"isInputThetaSketch": "false",
"size": "16384"
},
{
"name": "click_uv",
"fieldName": "click_user_id",
"type": "thetaSketch",
"isInputThetaSketch": "false",
"size": "16384"
}
],
"granularitySpec": {
"type": "uniform",
"segmentGranularity": "DAY",
"queryGranularity": "NONE",
"rollup": false
}
},
"tuningConfig": {
"type": "kafka",
"reportParseExceptions": false
},
"ioConfig": {
"topic": "ad_click_process",
"replicas": 1,
"taskDuration": "PT10M",
"completionTimeout": "PT20M",
"consumerProperties": {
"bootstrap.servers": "hp101:9092,hp102:9092"
}
}
}
2. Submit the Kafka indexing (supervisor) task
curl -X POST -H 'Content-Type: application/json' -d @kafka-index-adclicklog.json http://hp101:8090/druid/indexer/v1/supervisor
3. Query the data
{
"queryType":"timeseries",
"dataSource":"adclicklog",
"granularity":{"type": "period", "period": "PT1H", "timeZone": "Asia/Shanghai"},
"aggregations":[
{
"type":"longSum",
"name":"click",
"fieldName":"click_cnt"
},{
"type":"longSum",
"name":"pv",
"fieldName":"count"
}
],
"intervals":["2019-06-01/2019-06-30"]
}
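The query can be saved to a file and POSTed to the Broker's native query endpoint, mirroring how the supervisor spec was submitted above. This assumes the Broker runs on hp101 with its default port 8082, and the file name query-adclicklog.json is arbitrary:

curl -X POST -H 'Content-Type: application/json' -d @query-adclicklog.json http://hp101:8082/druid/v2/?pretty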