Our project requirement: data processed by a Spark job is sent to Kafka, and Flume then writes it into HDFS, grouped by table name, with the output files in a compressed format.
Flume supports custom sinks, so that is the mechanism we use here: we point the agent at our own sink class. The Flume version used is 1.7.
<dependency>
<groupId>org.apache.flume</groupId>
<artifactId>flume-ng-core</artifactId>
<version>1.7.0</version>
</dependency>
This is my Maven dependency (the sink code below additionally needs the Hadoop client and fastjson on the classpath).
Below is my Flume properties.properties configuration:
server.sources = kafka_test
server.channels = channel_test
server.sinks = hdfs_test
server.sources.kafka_test.type = org.apache.flume.source.kafka.KafkaSource
server.sources.kafka_test.kafka.topics = flume_test
server.sources.kafka_test.montime =
server.sources.kafka_test.nodatatime = 0
server.sources.kafka_test.kafka.topics.regex =
server.sources.kafka_test.kafka.consumer.group.id = test1
server.sources.kafka_test.kafka.bootstrap.servers = kafkaip:port
server.sources.kafka_test.kafka.security.protocol = PLAINTEXT
server.sources.kafka_test.batchDurationMillis = 1000
server.sources.kafka_test.batchSize = 1000
server.sources.kafka_test.channels = channel_test
server.channels.channel_test.type = memory
server.channels.channel_test.capacity = 10000
server.channels.channel_test.transactionCapacity = 10000
server.channels.channel_test.channelfullcount = 10
server.channels.channel_test.keep-alive = 3
server.channels.channel_test.byteCapacity =
server.channels.channel_test.byteCapacityBufferPercentage = 20
server.sinks.hdfs_test.type = com.kdriving.dataprocessor.flume.MySink
server.sinks.hdfs_test.channel = channel_test
Why a custom sink?
Because a single Kafka topic carries several kinds of data, not just one, and each kind has to be stored in a different HDFS directory according to its table name, so that Hive tables can later be mapped onto those directories.
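The routing decision itself is small: every Kafka message is a JSON object carrying a TABLE field, and that field picks the target HDFS directory. A stripped-down sketch of just that decision (the field name and the /data/<table>/year/month/day layout are the ones used by the full sink below; the class name is only illustrative):
import com.alibaba.fastjson.JSONObject;
public class TableRouter {
// The TABLE field of each JSON event selects the /data/<table>/<year>/<month>/<day> directory,
// mirroring what the sink's process() and createOutputStream() do below.
public static String targetDir(String eventJson, int year, int month, int day) {
JSONObject record = JSONObject.parseObject(eventJson);
String table = record.getString("TABLE"); // e.g. RADAR_RADAR_VALID
return "/data/" + table + "/" + year + "/" + month + "/" + day;
}
}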
Now let's look at the full sink code:
package com.kdriving.dataprocessor.flume;
import java.io.*;
import java.net.URI;
import java.text.SimpleDateFormat;
import java.util.*;
import com.alibaba.fastjson.JSONObject;
import com.google.common.base.Throwables;
import com.kd.common.application.AppInfo;
import com.kd.common.log.Logger;
import com.kd.common.log.LoggerFactory;
import org.apache.flume.*;
import org.apache.flume.conf.Configurable;
import org.apache.flume.instrumentation.SinkCounter;
import org.apache.flume.sink.AbstractSink;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.zlib.BuiltInZlibDeflater;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.kafka.clients.producer.Producer;
public class MySink extends AbstractSink implements Configurable {
private static final Logger LOGGER = LoggerFactory.getLogger(MySink.class);
private static String nameservices = AppInfo.instance.getConf().getProperty("datacenter.base.hdfs.nameservices");
private static String namenodes = AppInfo.instance.getConf().getProperty("datacenter.base.hdfs.namenodes");
private static String namenodesAddr = AppInfo.instance.getConf().getProperty("datacenter.base.hdfs.namenodesAddr");
private String hdfsURI;
private static final long defaultRollInterval = 30;
private String username;
private String dataDir;
private String dateFormat;
private URI uri;
private static Configuration conf = new Configuration();
private FSDataOutputStream out = null;
private Properties parameters;
private Producer<String, String> producer;
// private Context context;
private int batchSize = 1000; // number of events per transaction, committed as one batch
private SinkCounter sinkCounter;
// initialize the HDFS (HA) client configuration
static {
String[] namenodesArray = namenodes.split(",");
String[] namenodesAddrArray = namenodesAddr.split(",");
conf.set("fs.defaultFS", "hdfs://" + nameservices);
conf.set("dfs.nameservices", nameservices);
conf.set("dfs.ha.namenodes." + nameservices, namenodesArray[0] + "," + namenodesArray[1]);
conf.set("dfs.namenode.rpc-address.hacluster." + namenodesArray[0], namenodesAddrArray[0]);
conf.set("dfs.namenode.rpc-address.hacluster." + namenodesArray[1], namenodesAddrArray[1]);
conf.set("dfs.client.failover.proxy.provider." + nameservices, "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");
conf.setBoolean("dfs.support.append", true);
conf.set("dfs.client.block.write.replace-datanode-on-failure.policy", "DEFAULT");
conf.setBoolean("dfs.client.block.write.replace-datanode-on-failure.enable", true);
}
// all of the data-handling logic is implemented in process()
@Override
public Status process() throws EventDeliveryException {
LOGGER.info("开始执行mysink");
// TODO Auto-generated method stub
Status result = Status.READY;
Channel channel = getChannel();
Transaction transaction = null;
Event event = null;
String[] tablearrays = new String[]{"RADAR_RADAR_VALID", "RADAR_COMPREHENSIVE_VALID"};
try {
// To pick an HDFS compression format, check the io.compression.codecs property in hdfs core-site.xml for the codecs the cluster supports.
String CodecClass = "org.apache.hadoop.io.compress.BZip2Codec";
Class<?> codecClass = Class.forName(CodecClass);
CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
long processedEvent = 0;
transaction = channel.getTransaction();
transaction.begin(); // start the transaction
List<String> radarList = new ArrayList<>();
List<String> COMPREHENSIVEList = new ArrayList<>();
Boolean b = true;
Map<String, List<String>> tableMap = new HashMap<>();
while (b) {
event = channel.take(); // take one event from the channel
if (event == null) {
result = Status.BACKOFF;
break;
}
sinkCounter.incrementEventDrainAttemptCount();
byte[] eventBody = event.getBody();
String eventData = new String(eventBody, "UTF-8");
JSONObject jsonObject = JSONObject.parseObject(eventData);
String table = jsonObject.getString("TABLE");
if (table.equals(tablearrays[0])) {
radarList.add(eventData);
} else if (table.equals(tablearrays[1])) {
COMPREHENSIVEList.add(eventData);
}
if (radarList.size() > 1000) {
b = false;
} else if (COMPREHENSIVEList.size() > 1000) {
b = false;
}
processedEvent++;
}
tableMap.put(tablearrays[0], radarList);
tableMap.put(tablearrays[1], COMPREHENSIVEList);
LOGGER.info("messageList大小" + radarList.size());
if (processedEvent == 0) {
LOGGER.info("processedEvent=0");
sinkCounter.incrementBatchEmptyCount();
result = Status.BACKOFF;
} else {
if (processedEvent < batchSize) {
sinkCounter.incrementBatchUnderflowCount();
} else {
sinkCounter.incrementBatchCompleteCount();
}
sinkCounter.addToEventDrainAttemptCount(processedEvent);
for (Map.Entry<String, List<String>> entry : tableMap.entrySet()) {
FSDataOutputStream outputStream = createOutputStream(entry.getKey());
CompressionOutputStream cout = codec.createOutputStream(outputStream);
for (String body : entry.getValue()) {
String clo = "id|row|RSPID|LONGITUDE|CreateTime|TimeMin1|flightId|GroudSpeed|TRACKID|MsgType|REGID|TimeStamp|DataSource|fusion|DepAP|CallSign|TimeMin5|VerticalRate|SOURCE|ArrAP|DataType|Height|Vector|LATITUDE|Altitude|";
String[] split = clo.split("\\|");
JSONObject jsonObject = JSONObject.parseObject(body);
jsonObject.put("row", jsonObject.getString("id"));
StringBuilder stringBuilder = new StringBuilder();
for (String s : split) {
String string = jsonObject.getString(s);
if (string == null || string.equals("")) {
string = "null";
}
stringBuilder.append(string).append("|");
}
LOGGER.info("HDFS正在写入新数据->" + stringBuilder);
String string = stringBuilder.toString();
cout.write(string.getBytes("UTF-8"));
cout.write("\r\n".getBytes("UTF-8"));
cout.flush();
}
cout.close();
outputStream.close();
}
}
transaction.commit(); // the batch of events has been processed, commit the transaction
sinkCounter.addToEventDrainSuccessCount(processedEvent);
result = Status.READY;
} catch (Exception e) {
String errorMsg = "Failed to publish events !";
LOGGER.error(errorMsg, e);
e.printStackTrace();
result = Status.BACKOFF;
if (transaction != null) {
try {
transaction.rollback();
LOGGER.debug("transaction rollback success !");
} catch (Exception ex) {
LOGGER.error(errorMsg, ex);
throw Throwables.propagate(ex);
}
}
// throw new EventDeliveryException(errorMsg, e);
} finally {
if (transaction != null) {
transaction.close();
}
}
return result;
}
public FSDataOutputStream createOutputStream(String table) throws Exception {
String date = new SimpleDateFormat("yyyy-MM-dd").format(new Date());
Calendar calendar = Calendar.getInstance();
int year = calendar.get(Calendar.YEAR);
int month = calendar.get(Calendar.MONTH) + 1;
int day = calendar.get(Calendar.DATE);
String yearPath = "/data/" + table + "/" + year;
String monthPath = yearPath + "/" + month;
String dayPath = monthPath + "/" + day;
Path yearfilePath = new Path(yearPath);
Path monthfilePath = new Path(monthPath);
Path dayfilePath = new Path(dayPath);
String targetFile = dayPath + "/" + date + "-" + table + ".bz2";
Path filePath = new Path(targetFile);
FileSystem hdfs = filePath.getFileSystem(conf);
if (!hdfs.exists(yearfilePath)) {
hdfs.mkdirs(yearfilePath);
LOGGER.info(yearfilePath.toString() + " did not exist, created it");
}
if (!hdfs.exists(monthfilePath)) {
hdfs.mkdirs(monthfilePath);
LOGGER.info(monthfilePath.toString() + " did not exist, created it");
}
if (!hdfs.exists(dayfilePath)) {
hdfs.mkdirs(dayfilePath);
LOGGER.info(dayfilePath.toString() + " did not exist, created it");
}
if (!hdfs.exists(filePath)) {
hdfs.createNewFile(filePath);
LOGGER.info(filePath.toString() + " did not exist, created it");
}
return hdfs.append(filePath);
}
// Reads the sink's configuration from the Flume context; called when the sink is initialized.
@Override
public void configure(Context context) {
if (sinkCounter == null) {
sinkCounter = new SinkCounter(getName());
}
// customelog.sinks.sink1.type=death.flume.FlumeSinkDemo
// customelog.sinks.sink1.channel=channel1
// customelog.sinks.sink1.hdfsURI=hdfs://hostname:port
// customelog.sinks.sink1.username=hdfs
// customelog.sinks.sink1.dataDir=/death/data_sampling
// customelog.sinks.sink1.dateFormat=YYYY-MM-dd
}
// Called when the sink is started.
@Override
public synchronized void start() {
sinkCounter.start();
sinkCounter.incrementConnectionCreatedCount();
super.start();
// try {
// uri = new URI(hdfsURI);
// conf = new Configuration();
// }catch (Exception e){
// e.printStackTrace();
// }
}
// Called when the sink is stopped.
@Override
public synchronized void stop() {
sinkCounter.stop();
sinkCounter.incrementConnectionClosedCount();
super.stop();
}
}
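In process() the codec is instantiated purely from its class name via ReflectionUtils. If you are not sure which codecs the client Configuration actually registers (the io.compression.codecs property mentioned in the comment above), a small stand-alone check like the following sketch will list them; ListCodecs is just an illustrative name, the Hadoop calls are the standard client API:
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
public class ListCodecs {
public static void main(String[] args) {
// Prints every codec class discovered via the ServiceLoader or listed in io.compression.codecs.
Configuration conf = new Configuration();
List<Class<? extends CompressionCodec>> codecs = CompressionCodecFactory.getCodecClasses(conf);
for (Class<? extends CompressionCodec> c : codecs) {
System.out.println(c.getName());
}
}
}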
Put the project jar into Flume's lib directory, then deploy the properties.properties configuration. We start the Flume job through FusionInsight Manager:
select the node the Flume job should run on,
then upload the properties.properties file and click save configuration.
Next, check through Hue whether the file exists on HDFS.
The file was successfully stored under that path.
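Besides browsing the path in Hue, the written .bz2 file can be read back through the same BZip2Codec to spot-check the '|'-separated records. A minimal sketch, assuming the HDFS client configuration is on the classpath; the class name is only illustrative:
import java.io.BufferedReader;
import java.io.InputStreamReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.util.ReflectionUtils;
public class ReadBackCheck {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
// Path of a file the sink wrote, e.g. /data/RADAR_RADAR_VALID/2020/7/23/2020-07-23-RADAR_RADAR_VALID.bz2
Path file = new Path(args[0]);
FileSystem fs = file.getFileSystem(conf);
CompressionCodec codec = ReflectionUtils.newInstance(BZip2Codec.class, conf);
// Decompress on the fly and print the first ten records for a quick sanity check.
try (BufferedReader reader = new BufferedReader(new InputStreamReader(codec.createInputStream(fs.open(file)), "UTF-8"))) {
for (int i = 0; i < 10; i++) {
String line = reader.readLine();
if (line == null) break;
System.out.println(line);
}
}
}
}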
Next, map a Hive external table onto the directory:
CREATE EXTERNAL TABLE IF NOT EXISTS `RADAR_RADARAIRLINE_VALID`(
`id` STRING ,
`row` STRING ,
`RSPID` STRING ,
`LONGITUDE` STRING ,
`CreateTime` STRING ,
`TimeMin1` STRING ,
`flightId` STRING ,
`GroudSpeed` STRING ,
`TRACKID` STRING ,
`MsgType` STRING ,
`REGID` STRING ,
`TimeStamp` STRING ,
`DataSource` STRING ,
`fusion` STRING ,
`DepAP` STRING ,
`CallSign` STRING ,
`TimeMin5` STRING ,
`VerticalRate` STRING ,
`SOURCE` STRING ,
`ArrAP` STRING ,
`DataType` STRING ,
`Height` STRING ,
`Vector` STRING ,
`LATITUDE` STRING ,
`Altitude` STRING
)
PARTITIONED BY (`dt` string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs://namenodeip:port/data/radar_radar_valid/2020/07/23';
Once the association succeeds, query the data.
The query works!
There were of course plenty of bumps along the way; so-called experience is really just accumulated bit by bit through learning. This post records this classic combination of Flume with HDFS and Hive.