Project architecture
- While the mobile app is running, it reports startup logs, error logs, page logs, event logs, and usage-duration logs to the log collection server
- The log collection server forwards the collected logs to Kafka
- Flume consumes the five Kafka topics and stores the data in HDFS
- A scheduled task periodically copies the data from HDFS into the Hive data warehouse
- Core business analysis is done with Hive queries, and the results are displayed on front-end pages
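For reference, hypothetical samples of the five log payloads. Each contains the keyword field that the interceptor below keys on; createdAtMs, appId, and deviceId follow the Hive table defined at the end, and the remaining values are made up:
startup: {"createdAtMs":1539170000000,"appId":"app01","deviceId":"device001","network":"wifi"}
page:    {"createdAtMs":1539170000000,"appId":"app01","deviceId":"device001","pageId":"page001"}
event:   {"createdAtMs":1539170000000,"appId":"app01","deviceId":"device001","eventId":"ev001"}
usage:   {"createdAtMs":1539170000000,"appId":"app01","deviceId":"device001","singleUseDurationSecs":120}
error:   {"createdAtMs":1539170000000,"appId":"app01","deviceId":"device001","errorBrief":"NullPointerException"}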
Flume lets you plug in a custom interceptor.
Add the Maven dependency:
<dependencies>
    <dependency>
        <groupId>org.apache.flume</groupId>
        <artifactId>flume-ng-core</artifactId>
        <version>1.7.0</version>
    </dependency>
</dependencies>
Create an interceptor that inspects the collected log body for a distinguishing keyword and stores the log type in the Flume event header, so that each log can later be routed to a different HDFS location:
package com.atguigu.app.flume.interceptor;

import java.util.List;
import java.util.Map;

import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;

/**
 * Inspects each event body for a distinguishing keyword and records the
 * log type in the "logType" header, which the HDFS sink uses for routing.
 */
public class LogCollInterceptor implements Interceptor {

    private final boolean preserveExisting;

    private LogCollInterceptor(boolean preserveExisting) {
        this.preserveExisting = preserveExisting;
    }

    @Override
    public void initialize() {
    }

    @Override
    public Event intercept(Event event) {
        // 1. Get the headers of the incoming event
        Map<String, String> headers = event.getHeaders();

        // 2. Get the JSON body received by Flume and convert it to a string
        String jsonStr = new String(event.getBody());
        // Debug: print the raw body
        System.out.println(jsonStr);

        // Classify the log by its distinguishing field
        String logType = "";
        if (jsonStr.contains("pageId")) {                        // page log
            logType = "page";
        } else if (jsonStr.contains("eventId")) {                // event log
            logType = "event";
        } else if (jsonStr.contains("singleUseDurationSecs")) {  // usage log
            logType = "usage";
        } else if (jsonStr.contains("errorBrief")) {             // error log
            logType = "error";
        } else if (jsonStr.contains("network")) {                // startup log
            logType = "startup";
        }

        // 3. Store the log type in the event headers
        headers.put("logType", logType);
        return event;
    }

    @Override
    public List<Event> intercept(List<Event> events) {
        for (Event event : events) {
            intercept(event);
        }
        return events;
    }

    @Override
    public void close() {
    }

    public static class Constants {
        public static final String TIMESTAMP = "timestamp";
        public static final String PRESERVE = "preserveExisting";
        public static final boolean PRESERVE_DFLT = false;
    }

    public static class Builder implements Interceptor.Builder {
        private boolean preserveExisting;

        public Builder() {
            this.preserveExisting = Constants.PRESERVE_DFLT;
        }

        @Override
        public Interceptor build() {
            return new LogCollInterceptor(this.preserveExisting);
        }

        @Override
        public void configure(Context context) {
            this.preserveExisting = context.getBoolean(Constants.PRESERVE, Constants.PRESERVE_DFLT);
        }
    }
}
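A quick local check of the interceptor — a minimal sketch, assuming flume-ng-core on the classpath and the same package as the interceptor; the sample body is hypothetical:

import java.nio.charset.StandardCharsets;

import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.event.EventBuilder;
import org.apache.flume.interceptor.Interceptor;

public class LogCollInterceptorDemo {
    public static void main(String[] args) {
        // Build the interceptor the same way Flume does, through its Builder
        Interceptor.Builder builder = new LogCollInterceptor.Builder();
        builder.configure(new Context());
        Interceptor interceptor = builder.build();
        interceptor.initialize();

        // A hypothetical page log: the body just needs to contain "pageId"
        Event event = EventBuilder.withBody(
                "{\"pageId\":\"page001\",\"deviceId\":\"device001\"}",
                StandardCharsets.UTF_8);
        interceptor.intercept(event);

        // Prints "page"
        System.out.println(event.getHeaders().get("logType"));
        interceptor.close();
    }
}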
Add the following to the Flume configuration file:
a1.sources=r1
a1.channels=c1
a1.sinks=k1
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = com.atguigu.app.flume.interceptor.LogCollInterceptor$Builder
a1.sources.r1.type = org.apache.flume.source.kafka.KafkaSource
a1.sources.r1.batchSize = 5000
a1.sources.r1.batchDurationMillis = 2000
a1.sources.r1.kafka.bootstrap.servers = hadoop102:9092
a1.sources.r1.kafka.zookeeperConnect = hadoop102:2181,hadoop103:2181,hadoop104:2181
a1.sources.r1.kafka.topics=topic_app_startup,topic_app_error,topic_app_event,topic_app_usage,topic_app_page
a1.channels.c1.type=memory
a1.channels.c1.capacity=100000
a1.channels.c1.transactionCapacity=10000
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = /user/centos/applogs/%{logType}/%Y%m/%d/%H%M
a1.sinks.k1.hdfs.filePrefix = events-
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 30
a1.sinks.k1.hdfs.roundUnit = second
# avoid producing lots of small files
a1.sinks.k1.hdfs.rollInterval = 30
a1.sinks.k1.hdfs.rollSize = 0
a1.sinks.k1.hdfs.rollCount = 0
# write the output as plain data files (not SequenceFiles)
a1.sinks.k1.hdfs.fileType = DataStream
a1.sources.r1.channels = c1
a1.sinks.k1.channel= c1
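With the interceptor packaged as a jar and dropped into Flume's lib directory, the agent can then be started along these lines (the conf file name applog.conf is assumed):

flume-ng agent -n a1 -c $FLUME_HOME/conf -f $FLUME_HOME/conf/applog.conf -Dflume.root.logger=INFO,console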
Create partitioned external tables in Hive; the startup-log table is shown below (the org.openx JsonSerDe is a third-party SerDe, so its jar must be available on Hive's classpath):
CREATE EXTERNAL TABLE ext_startup_logs(
  createdAtMs bigint,
  appId string,
  tenantId string,
  deviceId string,
  appVersion string,
  appChannel string,
  appPlatform string,
  osType string,
  deviceStyle string,
  country string,
  province string,
  ipAddress string,
  network string,
  carrier string,
  brand string,
  screenSize string)
PARTITIONED BY (ym string, day string, hm string)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
STORED AS TEXTFILE;
Finally, write a script that runs once a minute to load the newly landed HDFS data into Hive; after that, all statistics can be computed with ordinary SQL queries. A sketch follows.
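A minimal sketch, assuming the HDFS layout produced by the sink above; the script name, the use of ALTER TABLE ... ADD PARTITION on the external table (rather than LOAD DATA), and loading only the startup table are choices of this sketch:

#!/bin/bash
# load_applogs.sh (hypothetical name): register the previous minute's HDFS
# directory as a Hive partition. The previous minute is used because the
# current one may still be receiving files from the sink.
ym=$(date -d '1 minute ago' +%Y%m)
day=$(date -d '1 minute ago' +%d)
hm=$(date -d '1 minute ago' +%H%M)
hive -e "
ALTER TABLE ext_startup_logs ADD IF NOT EXISTS
PARTITION (ym='${ym}', day='${day}', hm='${hm}')
LOCATION '/user/centos/applogs/startup/${ym}/${day}/${hm}';"

Scheduled via cron, e.g. (path hypothetical): * * * * * /path/to/load_applogs.sh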
GitHub address