HttpSource -> HDFS Sink
Here the data is written directly to HDFS, and an external Hive table is created over the same location so that it ends up in Hive indirectly. For the HDFS Sink configuration parameters, see http://lxw1234.com/archives/2015/10/527.htm
Configuration file
a1.sources=r1
a1.sinks=k1
a1.channels=c1
a1.sources.r1.type=http
a1.sources.r1.bind=localhost
a1.sources.r1.port=50000
a1.sources.r1.channels=c1
a1.sources.r1.handler=com.test.flume.PlainJSONHandler
a1.sources.r1.interceptors = i1 i2
a1.sources.r1.interceptors.i1.preserveExisting = true
a1.sources.r1.interceptors.i1.type = timestamp
a1.sources.r1.interceptors.i2.type = host
a1.sources.r1.interceptors.i2.hostHeader = hostname
a1.sinks.k1.channel=c1
a1.sinks.k1.type=hdfs
#a1.sinks.k1.hdfs.path = hdfs://ip:port/user/hive/warehouse/flume_test
a1.sinks.k1.hdfs.path=/user/flume/testHive/day=%Y%m%d/
a1.sinks.k1.serializer = com.test.flume.HiveSerializer$Builder
a1.sinks.k1.hdfs.writeFormat = Text
a1.sinks.k1.hdfs.fileType = DataStream
## file roll
a1.sinks.k1.hdfs.filePrefix = log_%Y%m%d_%H
a1.sinks.k1.hdfs.useLocalTimeStamp = true
a1.sinks.k1.hdfs.minBlockReplicas=1
a1.sinks.k1.hdfs.rollInterval = 3600
a1.sinks.k1.hdfs.rollCount = 0
a1.sinks.k1.hdfs.rollSize = 128000000
a1.sinks.k1.hdfs.batchSize = 100
# close file time
a1.sinks.k1.hdfs.idleTimeout=60
a1.channels.c1.type=memory
a1.channels.c1.capacity=1000
a1.channels.c1.transactionCapacity=100
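To sanity-check the pipeline end to end, an event can be POSTed to the source. The following is a minimal sketch of a test client, assuming the source is reachable at localhost:50000 as configured above; the class name and the JSON keys in the sample payload are only placeholders.
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;

public class HttpSourceTestClient {
    public static void main(String[] args) throws Exception {
        // hypothetical payload; the real AppsFlyer push message has many more keys
        String json = "{\"device_model\":\"demo\",\"event_name\":\"install\",\"ip\":\"1.2.3.4\"}";
        HttpURLConnection conn = (HttpURLConnection) new URL("http://localhost:50000").openConnection();
        conn.setRequestMethod("POST");
        conn.setDoOutput(true);
        conn.setRequestProperty("Content-Type", "application/json");
        try (OutputStream os = conn.getOutputStream()) {
            os.write(json.getBytes("UTF-8"));
        }
        // PlainJSONHandler wraps the raw body into a single Flume event,
        // so a 200 here means the event was handed to the channel
        System.out.println("HTTP response code: " + conn.getResponseCode());
    }
}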
Notes:
(1) The default serializer only writes out the data in the event body. To include additional information (such as the timestamp or IP), write a custom serializer: implement EventSerializer and override its write method.
(2) Customize the file name used when rolling, otherwise every request produces its own file. Here rolling is time-based (rollInterval, plus idleTimeout to close idle files); rolling can also be driven by other settings such as rollCount (number of events) or rollSize (file size).
(3) To load the data into Hive, the fields must be written out in the same order as the columns of the Hive table.
(4) The day=xxx directory layout matches the partition scheme of the Hive table.
HiveSerializer
package com.test.flume;

import java.io.IOException;
import java.io.OutputStream;

import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.TimestampInterceptor;
import org.apache.flume.serialization.EventSerializer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.alibaba.fastjson.JSON;
import com.test.flume.vo.AppsFlyerLogVo;

public class HiveSerializer implements EventSerializer {

    private final static Logger logger = LoggerFactory.getLogger(HiveSerializer.class);

    public static final String FORMAT = "format";
    public static final String REGEX = "regex";
    public static final String REGEX_ORDER = "regexorder";

    // private final String DEFAULT_FORMAT = "CSV";
    // private final String format;

    private final OutputStream out;

    private HiveSerializer(OutputStream out, Context ctx) {
        // this.format = ctx.getString(FORMAT, DEFAULT_FORMAT);
        // if (!format.equals(DEFAULT_FORMAT)) {
        //     logger.warn("Unsupported output format" + format + ", using default instead");
        // }
        this.out = out;
    }
    public void write(Event event) throws IOException {
        // resolve the event timestamp from the header set by the timestamp
        // interceptor, falling back to the current time. It is not written
        // out below; add it to writeLogVo (and a matching Hive column) if it
        // should be persisted
        String timestamp = event.getHeaders().get(TimestampInterceptor.Constants.TIMESTAMP);
        if (timestamp == null || timestamp.isEmpty()) {
            long now = System.currentTimeMillis();
            timestamp = Long.toString(now);
        }
        // parse the JSON body into the value object and write it out as one line
        String bodyStr = new String(event.getBody());
        AppsFlyerLogVo vo = JSON.parseObject(bodyStr, AppsFlyerLogVo.class);
        logger.info("Serializer event: {}", vo);
        writeLogVo(vo);
        out.write('\n');
    }
    // write the fields in the same order as the columns of the Hive table
    private void writeLogVo(AppsFlyerLogVo vo) {
        try {
            writeField(vo.getDeviceModel());
            writeField(vo.getDownloadTimeSelectedTimezone());
            writeField(vo.getDownloadTime());
            writeField(vo.getOperator());
            writeField(vo.getIp());
            writeField(vo.getAppName());
            writeField(vo.getCity());
            writeField(vo.getCustomerUserId());
            writeField(vo.getInstallTimeSelectedTimezone());
            writeField(vo.getEventName());
            writeField(vo.getEventTimeSelectedTimezone());
            writeField(vo.getIsRetargeting());
            writeField(vo.getInstallTime());
            writeField(vo.getEventTime());
            writeField(vo.getPlatform());
            writeField(vo.getSdkVersion());
            writeField(vo.getAppsflyerDeviceId());
            writeField(vo.getSelectedCurrency());
            writeField(vo.getWifi());
            writeField(vo.getAdvertisingId());
            writeField(vo.getMediaSource());
            writeField(vo.getCountryCode());
            writeField(vo.getBundleId());
            writeField(vo.getCarrier());
            writeField(vo.getLanguage());
            writeField(vo.getAppId());
            writeField(vo.getAppVersion());
            writeField(vo.getAttributionType());
            writeField(vo.getOsVersion());
            writeField(vo.getDeviceBrand());
            writeField(vo.getEventType());
        } catch (Exception e) {
            logger.error("Failed to serialize event", e);
        }
    }
    // write one field followed by the Ctrl-B (\u0002) column delimiter;
    // a null field becomes an empty column
    private void writeField(String field) {
        try {
            if (field != null) {
                out.write(field.getBytes());
            }
            out.write('\u0002');
        } catch (IOException e) {
            logger.error("Failed to write field", e);
        }
    }
    public void afterCreate() throws IOException {
        // no-op
    }

    public void afterReopen() throws IOException {
        // no-op
    }

    public void flush() throws IOException {
        // no-op, as in Flume's built-in text serializers
    }

    public void beforeClose() throws IOException {
        // no-op
    }

    public boolean supportsReopen() {
        return false;
    }

    public static class Builder implements EventSerializer.Builder {
        public EventSerializer build(Context context, OutputStream out) {
            return new HiveSerializer(out, context);
        }
    }
}
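The serializer above relies on a plain value object, AppsFlyerLogVo, which is not shown in the original post. A minimal sketch follows; only a few of the 31 fields are spelled out, the rest follow the same pattern, and the @JSONField names are assumptions about the keys in the incoming JSON.
package com.test.flume.vo;

import com.alibaba.fastjson.annotation.JSONField;

public class AppsFlyerLogVo {

    // the @JSONField names below are assumed JSON keys; adjust them to the
    // actual payload sent to the HTTP source
    @JSONField(name = "device_model")
    private String deviceModel;

    @JSONField(name = "download_time_selected_timezone")
    private String downloadTimeSelectedTimezone;

    @JSONField(name = "event_type")
    private String eventType;

    // ... one field per Hive column, 31 in total

    public String getDeviceModel() { return deviceModel; }
    public void setDeviceModel(String deviceModel) { this.deviceModel = deviceModel; }

    public String getDownloadTimeSelectedTimezone() { return downloadTimeSelectedTimezone; }
    public void setDownloadTimeSelectedTimezone(String v) { this.downloadTimeSelectedTimezone = v; }

    public String getEventType() { return eventType; }
    public void setEventType(String eventType) { this.eventType = eventType; }

    @Override
    public String toString() {
        return "AppsFlyerLogVo[deviceModel=" + deviceModel + ", eventType=" + eventType + "]";
    }
}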
Hive table creation
DROP TABLE flume_test2;
CREATE EXTERNAL TABLE IF NOT EXISTS flume_test2(
device_model string,
download_time_selected_timezone string,
download_time string,
operator string,
ip string,
app_name string,
city string,
customer_user_id string,
install_time_selected_timezone string,
event_name string,
event_time_selected_timezone string,
is_retargeting string,
install_time string,
event_time string,
platform string,
sdk_version string,
appsflyer_device_id string,
selected_currency string,
wifi string,
advertising_id string,
media_source string,
country_code string,
bundle_id string,
carrier string,
language string,
app_id string,
app_version string,
attribution_type string,
os_version string,
device_brand string,
event_type string
)
PARTITIONED BY (day int)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\u0002'
STORED AS TEXTFILE
LOCATION '/user/flume/testHive/';
After creating the Hive table, the corresponding partitions also have to be added:
ALTER TABLE flume_test2 ADD PARTITION (day = 20161122);
This statement has to be executed by a scheduled job or something similar.
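A minimal sketch of such a daily job is shown below, using the Hive JDBC driver; the HiveServer2 URL, credentials, and class name are assumptions and have to be adapted to the actual cluster. Run once a day, it keeps the partitions in step with the day=%Y%m%d directories the sink creates.
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;
import java.text.SimpleDateFormat;
import java.util.Date;

public class AddDailyPartition {
    public static void main(String[] args) throws Exception {
        // requires hive-jdbc on the classpath; URL and credentials are placeholders
        Class.forName("org.apache.hive.jdbc.HiveDriver");
        String day = new SimpleDateFormat("yyyyMMdd").format(new Date());
        try (Connection conn = DriverManager.getConnection("jdbc:hive2://localhost:10000/default", "hive", "");
             Statement stmt = conn.createStatement()) {
            // IF NOT EXISTS makes the job safe to re-run
            stmt.execute("ALTER TABLE flume_test2 ADD IF NOT EXISTS PARTITION (day=" + day + ")");
        }
    }
}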
This article draws on http://www.huyanping.cn/flumehive%E5%A4%84%E7%90%86%E6%97%A5%E5%BF%97/
Afterword
Besides creating an external table over the HDFS directory, the data can also be written to HDFS in the proper format and then loaded into Hive with LOAD DATA.
To write into Hive with the Hive Sink instead, see:
http://stackoverflow.com/questions/30908641/save-flume-output-to-hive-table-with-hive-sink
http://henning.kropponline.de/2015/05/19/hivesink-for-flume/
The official Flume documentation also covers the Hive Sink in detail and is worth consulting.
PlainJSONHandler
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.test.flume;
import java.io.BufferedReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.servlet.http.HttpServletRequest;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.event.JSONEvent;
import org.apache.flume.source.http.HTTPBadRequestException;
import org.apache.flume.source.http.HTTPSourceHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.gson.JsonParseException;
import com.google.gson.JsonParser;
/**
* PlainJSONHandler for HTTPSource that accepts json-based http body.
*
* This handler throws exception if the deserialization fails because of bad
* format or any other reason.
*/
public class PlainJSONHandler implements HTTPSourceHandler {

    private static final String FORWARD_HEADERS = "forwardHeaders";
    private static final Logger LOG = LoggerFactory.getLogger(PlainJSONHandler.class);
    private static JsonParser parser = new JsonParser();
    private static Set<String> forwardHeaders = new HashSet<String>();
    public List<Event> getEvents(HttpServletRequest request) throws Exception {
        // copy over only the request headers that were configured to be forwarded
        Map<String, String> eventHeaders = new HashMap<String, String>();
        Enumeration requestHeaders = request.getHeaderNames();
        while (requestHeaders.hasMoreElements()) {
            String header = (String) requestHeaders.nextElement();
            if (forwardHeaders.contains(header)) {
                eventHeaders.put(header, request.getHeader(header));
            }
        }
        // read the whole request body; readLine() strips the line terminators,
        // so the lines are simply concatenated
        BufferedReader reader = request.getReader();
        List<Event> eventList = new ArrayList<Event>(1);
        StringBuilder bodyBuffer = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            bodyBuffer.append(line);
        }
        if (bodyBuffer.length() > 0) {
            /*try {
                parser.parse(bodyBuffer.toString());
            } catch (JsonParseException ex) {
                throw new HTTPBadRequestException(
                        "HTTP body is not a valid JSON object.", ex);
            }*/
            Event event = new JSONEvent();
            event.setBody(bodyBuffer.toString().getBytes());
            event.setHeaders(eventHeaders);
            eventList.add(event);
            LOG.info("Event body: " + new String(event.getBody()));
        }
        return eventList;
    }
    public void configure(Context context) {
        String confForwardHeaders = context.getString(FORWARD_HEADERS);
        if (confForwardHeaders != null) {
            if (forwardHeaders.addAll(Arrays.asList(confForwardHeaders.split(",")))) {
                LOG.debug("forwardHeaders=" + forwardHeaders);
            } else {
                LOG.error("failed to add forward headers from " + confForwardHeaders);
            }
        } else {
            LOG.debug("no forwardHeaders");
        }
    }
}
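Note that PlainJSONHandler only copies request headers whose names are listed in its forwardHeaders property into the Flume event headers. The HTTP source passes handler parameters with the handler. prefix, so a configuration line like the following (the header names are just examples) would enable it:
a1.sources.r1.handler.forwardHeaders = host,user-agent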