文章目录
一 使用flume将日志文件收集到HDFS
logger server – flume读数据 – kafka – flume – hdfs
第一层flume:taildir source – memory channel(或者file channel) – kafka sink 【传统架构】【√】
taildir source – kafka channel 【使用kafka channel】
第二层flume:kafka source – memory channel(或者file channel) – hdfs sink 【传统架构】【√】需要添加拦截器,如果没有source,没有办法添加拦截器
kafka channel – hdfs sink 【使用kafka channel】
1 第一层flume实现过程(采集日志flume)
在实现第一层flume之前,需要进行数据清洗,将读取到的不符合规则的数据去除掉
(1)java实现过程
创建maven工程,编辑依赖信息(pom.xml)
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.hike.gmall</groupId>
    <!-- NOTE(review): "Colllect" (triple l) is kept as-is because the packaged
         jar name Colllect-1.0-SNAPSHOT-jar-with-dependencies.jar is referenced
         later in this document. -->
    <artifactId>Colllect</artifactId>
    <version>1.0-SNAPSHOT</version>
    <build>
        <plugins>
            <!-- The original POM declared maven-compiler-plugin twice with
                 conflicting source/target (7 vs 1.8). Duplicate plugin
                 declarations are invalid; merged into a single Java 8 entry. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>2.3.2</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <!-- Build a fat jar (…-jar-with-dependencies.jar) so fastjson is
                 bundled when the interceptor jar is dropped into flume/lib. -->
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
    <dependencies>
        <!-- provided: Flume supplies its own core classes at runtime, so the
             fat jar must not bundle them. -->
        <dependency>
            <groupId>org.apache.flume</groupId>
            <artifactId>flume-ng-core</artifactId>
            <version>1.9.0</version>
            <scope>provided</scope>
        </dependency>
        <!-- compile (bundled): fastjson is NOT on Flume's classpath, so it must
             be packed into the jar-with-dependencies. -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.62</version>
            <scope>compile</scope>
        </dependency>
    </dependencies>
</project>
编辑代码
package com.hike.gmall.interceptor;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONException;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.List;

/**
 * ETL interceptor for the first-tier Flume agent: drops events whose body is
 * not complete, parseable JSON, so malformed log lines never reach Kafka.
 *
 * <p>Registered in the agent config as
 * {@code com.hike.gmall.interceptor.ETLLogInterceptor$MyBuilder}.
 */
public class ETLLogInterceptor implements Interceptor {

    @Override
    public void initialize() {
        // No resources to initialize.
    }

    /**
     * Validates a single event.
     *
     * @param event the incoming Flume event
     * @return the event unchanged when its body parses as JSON, or
     *         {@code null} to signal that it should be dropped
     */
    @Override
    public Event intercept(Event event) {
        // Decode the body explicitly as UTF-8 rather than the platform charset.
        String body = new String(event.getBody(), StandardCharsets.UTF_8);
        // Use fastjson to verify the record is complete, well-formed JSON.
        try {
            JSON.parseObject(body);
        } catch (JSONException e) {
            // Incomplete/corrupt record: drop it.
            return null;
        }
        return event;
    }

    @Override
    public List<Event> intercept(List<Event> list) {
        // An enhanced for-loop cannot remove elements while iterating;
        // use an explicit Iterator so Iterator.remove() is available.
        Iterator<Event> iterator = list.iterator();
        while (iterator.hasNext()) {
            if (intercept(iterator.next()) == null) {
                iterator.remove();
            }
        }
        return list;
    }

    @Override
    public void close() {
        // Nothing to release.
    }

    /** Builder instantiated by Flume from the agent configuration. */
    public static class MyBuilder implements Builder {
        @Override
        public Interceptor build() {
            return new ETLLogInterceptor();
        }

        @Override
        public void configure(Context context) {
            // No configurable options.
        }
    }
}
打包,选择带依赖的jar包 Colllect-1.0-SNAPSHOT-jar-with-dependencies.jar
上传到/opt/module/flume-1.9.0/lib目录下
(2)编写配置信息
# First-tier Flume agent: TAILDIR source -> Kafka channel.
# No sink is defined: the Kafka channel itself writes events into topic_log.
a1.sources = r1
a1.channels = c1
# TAILDIR tails every file matching app.* and supports resume-on-restart.
a1.sources.r1.type = TAILDIR
a1.sources.r1.filegroups = f1
a1.sources.r1.filegroups.f1 = /opt/module/applog/log/app.*
# Read offsets are persisted here so a restart does not re-read old data.
a1.sources.r1.positionFile = /opt/module/flume-1.9.0/jobs/position/position.json
# ETL interceptor (from the jar uploaded to flume/lib) drops non-JSON events.
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = com.hike.gmall.interceptor.ETLLogInterceptor$MyBuilder
# Kafka channel replaces channel+sink: events go straight into the topic.
a1.channels.c1.type = org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c1.kafka.bootstrap.servers = hadoop101:9092,hadoop102:9092,hadoop103:9092
a1.channels.c1.kafka.topic = topic_log
# false => store the raw event body only (no Flume Avro wrapper), so plain
# Kafka consumers and the second-tier agent can read the messages directly.
a1.channels.c1.parseAsFlumeEvent = false
a1.sources.r1.channels = c1
/opt/module/flume-1.9.0/jobs/gmall
vim logserver-flume-kafka.conf
#启动消费者
kafka-console-consumer.sh --topic topic_log --bootstrap-server hadoop101:9092
#启动flume采集程序,查看消费者是否采集到数据
flume-ng agent -c $FLUME_HOME/conf -f $FLUME_HOME/jobs/gmall/logserver-flume-kafka.conf -n a1 -Dflume.root.logger=INFO,console
#可以再执行log脚本生产一些数据,查看是否可以采集到
(3)第一层flume起停脚本
#!/bin/bash
# f1.sh -- start/stop the first-tier Flume agents (TAILDIR -> Kafka channel)
# on hadoop101 and hadoop102.
if [ $# -lt 1 ]
then
  echo "USAGE: f1.sh {start|stop}"
  # Exit non-zero so callers can detect the usage error.
  exit 1
fi
case $1 in
start)
  for i in hadoop101 hadoop102
  do
    # \$FLUME_HOME is escaped so it expands on the REMOTE host (my_env.sh was
    # distributed to every node), not on the machine running this script.
    ssh $i "nohup flume-ng agent -c \$FLUME_HOME/conf -f \$FLUME_HOME/jobs/gmall/logserver-flume-kafka.conf -n a1 -Dflume.root.logger=INFO,console 1>\$FLUME_HOME/logs/flume.log 2>&1 &"
  done
;;
stop)
  for i in hadoop101 hadoop102
  do
    # Locate the agent by its config file name and kill it.
    ssh $i "ps -ef | grep logserver-flume-kafka.conf | grep -v grep | awk '{print \$2}' | xargs -n1 kill -9"
  done
;;
*)
  echo "USAGE: f1.sh {start|stop}"
  exit 1
;;
esac
/opt/module/flume-1.9.0
mkdir logs
cd ~/bin
vim f1.sh
chmod u+x f1.sh
scp -r /opt/module/flume-1.9.0/ hadoop102:/opt/module/
scp /etc/profile.d/my_env.sh root@hadoop102:/etc/profile.d/
2 第二层flume实现过程(消费kafka数据flume)
第二层flume:kafka source – memory channel(或者file channel) – hdfs sink 【传统架构】【√】需要添加拦截器,如果没有source,没有办法添加拦截器
kafka channel – hdfs sink 【使用kafka channel】
为保证分区所用的时间就是用户行为日志的生成时间(因数据在传输过程中也会耗费时间),所以需要在kafka source中添加拦截器,使用日志自带的时间戳,不使用本地时间戳。
(1)拦截器实现过程
package com.hike.gmall.interceptor;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;
import java.nio.charset.StandardCharsets;
import java.util.List;

/**
 * Second-tier interceptor: copies the log record's own "ts" field into the
 * event's "timestamp" header so the HDFS sink partitions data by the log's
 * generation time rather than by its (later) arrival time.
 *
 * <p>Registered in the agent config as
 * {@code com.hike.gmall.interceptor.TimeStampInterceptor$MyBuilder}.
 */
public class TimeStampInterceptor implements Interceptor {

    @Override
    public void initialize() {
        // No resources to initialize.
    }

    /**
     * Stamps one event with its embedded log time.
     *
     * @param event the incoming Flume event
     * @return the same event, with a "timestamp" header when a "ts" field exists
     */
    @Override
    public Event intercept(Event event) {
        String body = new String(event.getBody(), StandardCharsets.UTF_8);
        try {
            JSONObject jsonObject = JSON.parseObject(body);
            String ts = jsonObject.getString("ts");
            // Guard against records without "ts": a null header value would
            // later break the HDFS sink's %Y-%m-%d path escaping.
            if (ts != null) {
                event.getHeaders().put("timestamp", ts);
            }
        } catch (Exception ignored) {
            // Malformed JSON should already have been filtered out by
            // ETLLogInterceptor in the first tier; if a bad record slips
            // through, pass it along without a timestamp header rather than
            // failing the whole batch.
        }
        return event;
    }

    @Override
    public List<Event> intercept(List<Event> list) {
        // intercept(Event) never drops events here, so a plain loop suffices.
        for (Event event : list) {
            intercept(event);
        }
        return list;
    }

    @Override
    public void close() {
        // Nothing to release.
    }

    /** Builder instantiated by Flume from the agent configuration. */
    public static class MyBuilder implements Builder {
        @Override
        public Interceptor build() {
            return new TimeStampInterceptor();
        }

        @Override
        public void configure(Context context) {
            // No configurable options.
        }
    }
}
(2)配置文件实现过程
将第二层flume配置到hadoop103上
scp -r /opt/module/flume-1.9.0/ hadoop103:/opt/module/
scp /etc/profile.d/my_env.sh root@hadoop103:/etc/profile.d/
#在hadoop103进行操作(第二层flume部署在hadoop103上)
/opt/module/flume-1.9.0/lib
rm -rf Colllect-1.0-SNAPSHOT-jar-with-dependencies.jar
#上传jar包
/opt/module/flume-1.9.0/jobs
mkdir filechannel
mkdir checkpoint
# Second-tier Flume agent: Kafka source -> file channel -> HDFS sink.
a1.sources = r1
a1.channels = c1
a1.sinks = k1
a1.sources.r1.type = org.apache.flume.source.kafka.KafkaSource
a1.sources.r1.kafka.bootstrap.servers = hadoop101:9092,hadoop102:9092,hadoop103:9092
a1.sources.r1.kafka.topics = topic_log
a1.sources.r1.kafka.consumer.group.id = gmall
a1.sources.r1.batchDurationMillis = 2000
# Interceptor copies each record's own "ts" field into the "timestamp" header
# so the HDFS sink buckets by log generation time, not arrival time.
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = com.hike.gmall.interceptor.TimeStampInterceptor$MyBuilder
# File channel: buffers events on disk so data survives an agent crash.
a1.channels.c1.type = file
a1.channels.c1.dataDirs = /opt/module/flume-1.9.0/jobs/filechannel
# Maximum number of events the channel may hold.
a1.channels.c1.capacity = 1000000
# Checkpoint dir persists the channel's in-memory queue state to disk.
a1.channels.c1.checkpointDir = /opt/module/flume-1.9.0/jobs/checkpoint
#a1.channels.c1.useDualCheckpoints = true
#a1.channels.c1.backupCheckpointDir = /opt/module/flume-1.9.0/jobs/checkpoint-bk
a1.channels.c1.transactionCapacity = 10000
a1.channels.c1.maxFileSize = 2146435071
a1.channels.c1.keep-alive = 5
a1.sinks.k1.type = hdfs
# %Y-%m-%d resolves from the "timestamp" header set by the interceptor above.
a1.sinks.k1.hdfs.path = /origin_data/gmall/log/topic_log/%Y-%m-%d
a1.sinks.k1.hdfs.filePrefix = log-
a1.sinks.k1.hdfs.round = false
# Roll a new file every 10 s or 128 MB; never roll by event count.
a1.sinks.k1.hdfs.rollInterval = 10
a1.sinks.k1.hdfs.rollSize = 134217728
a1.sinks.k1.hdfs.rollCount = 0
# Output files are compressed streams using the lzop codec (not plain text).
a1.sinks.k1.hdfs.fileType = CompressedStream
a1.sinks.k1.hdfs.codeC = lzop
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
/opt/module/flume-1.9.0/jobs/gmall
vim kafka-flume-hdfs.conf
flume-ng agent -c $FLUME_HOME/conf -f $FLUME_HOME/jobs/gmall/kafka-flume-hdfs.conf -n a1 -Dflume.root.logger=INFO,console
#完成之后可以在hdfs端查看到收集的文件
(3)第二层起停脚本
#!/bin/bash
# f2.sh -- start/stop the second-tier Flume agent (Kafka -> HDFS) on hadoop103.
if [ $# -lt 1 ]
then
  echo "USAGE: f2.sh {start|stop}"
  # Exit non-zero so callers can detect the usage error.
  exit 1
fi
case $1 in
start)
  for i in hadoop103
  do
    # \$FLUME_HOME is escaped so it expands on the REMOTE host (my_env.sh was
    # distributed to hadoop103), not on the machine running this script.
    ssh $i "nohup flume-ng agent -c \$FLUME_HOME/conf -f \$FLUME_HOME/jobs/gmall/kafka-flume-hdfs.conf -n a1 -Dflume.root.logger=INFO,console 1>\$FLUME_HOME/logs/flume.log 2>&1 &"
  done
;;
stop)
  for i in hadoop103
  do
    # Locate the agent by its config file name and kill it.
    ssh $i "ps -ef | grep kafka-flume-hdfs.conf | grep -v grep | awk '{print \$2}' | xargs -n1 kill -9"
  done
;;
*)
  echo "USAGE: f2.sh {start|stop}"
  exit 1
;;
esac