Flume Notes (2): Storing HttpSource Data in Hive

HttpSource -> HDFS Sink

Here the data is written directly to HDFS, and a Hive external table is created over the same location, so the data lands in Hive indirectly. For the HDFS Sink configuration parameters, see http://lxw1234.com/archives/2015/10/527.htm

Configuration file

 a1.sources=r1
 a1.sinks=k1
 a1.channels=c1
 a1.sources.r1.type=http
 a1.sources.r1.bind=localhost
 a1.sources.r1.port=50000
 a1.sources.r1.channels=c1
 a1.sources.r1.handler=com.test.flume.PlainJSONHandler
 a1.sources.r1.interceptors = i1 i2
 a1.sources.r1.interceptors.i1.preserveExisting = true
 a1.sources.r1.interceptors.i1.type = timestamp
 a1.sources.r1.interceptors.i2.type = host
 a1.sources.r1.interceptors.i2.hostHeader = hostname
 
 a1.sinks.k1.channel=c1
 a1.sinks.k1.type=hdfs
 #a1.sinks.k1.hdfs.path = hdfs://ip:port/user/hive/warehouse/flume_test
 a1.sinks.k1.hdfs.path=/user/flume/testHive/day=%Y%m%d/
 a1.sinks.k1.serializer = com.test.flume.HiveSerializer$Builder
 a1.sinks.k1.hdfs.writeFormat = Text
 a1.sinks.k1.hdfs.fileType = DataStream
 
 ## file roll
 a1.sinks.k1.hdfs.filePrefix = log_%Y%m%d_%H
 a1.sinks.k1.hdfs.useLocalTimeStamp = true
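 # minBlockReplicas = 1 keeps HDFS block replication activity from triggering premature file rolls (a common Flume tuning, added here as a note)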
 a1.sinks.k1.hdfs.minBlockReplicas=1
 a1.sinks.k1.hdfs.rollInterval = 3600
 a1.sinks.k1.hdfs.rollCount = 0
 a1.sinks.k1.hdfs.rollSize = 128000000
 a1.sinks.k1.hdfs.batchSize = 100
 # close files that have been idle for 60 seconds
 a1.sinks.k1.hdfs.idleTimeout=60
 
 a1.channels.c1.type=memory
 a1.channels.c1.capacity=1000
 a1.channels.c1.transactionCapacity=100


 Notes:

  (1) The default serializer writes out only the event body. To control the output format or include other information from the headers (such as the timestamp or IP), define a custom serializer: implement EventSerializer and override its write method.

  (2) Customize the file name when rolling, otherwise each request can end up producing its own file. Here rolling is time-based (rollInterval = 3600, i.e. a new file every hour, with idle files closed after 60 seconds by idleTimeout); rolling can also be driven by other settings such as rollCount (number of events) or rollSize (file size).

  (3) To land the data in Hive, the fields must be written out in exactly the same order as the columns of the Hive table.

  (4) The day=xxx directory naming matches the Hive table's partitioning. Note that the %Y%m%d escapes require a timestamp, supplied here by the timestamp interceptor or hdfs.useLocalTimeStamp. A smoke test of the source is sketched below.
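With the agent running, the source can be smoke-tested by POSTing a JSON body to it. A minimal sketch (the payload keys are assumptions based on the AppsFlyer-style schema that AppsFlyerLogVo parses; adjust them to the real payload):

import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;

public class HttpSourceSmokeTest {
    public static void main(String[] args) throws Exception {
        // host/port must match a1.sources.r1.bind and a1.sources.r1.port
        URL url = new URL("http://localhost:50000");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestMethod("POST");
        conn.setRequestProperty("Content-Type", "application/json");
        conn.setDoOutput(true);

        // hypothetical payload; keys must match the VO's JSON field mappings
        String json = "{\"device_model\":\"test\",\"event_name\":\"install\"}";
        try (OutputStream os = conn.getOutputStream()) {
            os.write(json.getBytes(StandardCharsets.UTF_8));
        }
        // the HTTP source answers 200 once the event is committed to the channel
        System.out.println("HTTP " + conn.getResponseCode());
        conn.disconnect();
    }
}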


HiveSerializer

package com.test.flume;

import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;

import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.serialization.EventSerializer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.alibaba.fastjson.JSON;
import com.test.flume.vo.AppsFlyerLogVo;

public class HiveSerializer implements EventSerializer {

    private final static Logger logger = LoggerFactory.getLogger(HiveSerializer.class);

    private final OutputStream out;

    private HiveSerializer(OutputStream out, Context ctx) {
        this.out = out;
    }

    public void write(Event event) throws IOException {
        // Only the body fields are written, in Hive column order. The header
        // timestamp set by the timestamp interceptor remains available via
        // event.getHeaders() if extra columns are ever added to the table.
        String bodyStr = new String(event.getBody(), StandardCharsets.UTF_8);
        AppsFlyerLogVo vo = JSON.parseObject(bodyStr, AppsFlyerLogVo.class);
        logger.debug("Serializing event: {}", vo);
        writeLogVo(vo);
        out.write('\n');
    }


    // Field order must match the column order of the Hive table exactly.
    private void writeLogVo(AppsFlyerLogVo vo) throws IOException {
        writeField(vo.getDeviceModel());
        writeField(vo.getDownloadTimeSelectedTimezone());
        writeField(vo.getDownloadTime());
        writeField(vo.getOperator());
        writeField(vo.getIp());
        writeField(vo.getAppName());
        writeField(vo.getCity());
        writeField(vo.getCustomerUserId());
        writeField(vo.getInstallTimeSelectedTimezone());
        writeField(vo.getEventName());
        writeField(vo.getEventTimeSelectedTimezone());
        writeField(vo.getIsRetargeting());
        writeField(vo.getInstallTime());
        writeField(vo.getEventTime());
        writeField(vo.getPlatform());
        writeField(vo.getSdkVersion());
        writeField(vo.getAppsflyerDeviceId());
        writeField(vo.getSelectedCurrency());
        writeField(vo.getWifi());
        writeField(vo.getAdvertisingId());
        writeField(vo.getMediaSource());
        writeField(vo.getCountryCode());
        writeField(vo.getBundleId());
        writeField(vo.getCarrier());
        writeField(vo.getLanguage());
        writeField(vo.getAppId());
        writeField(vo.getAppVersion());
        writeField(vo.getAttributionType());
        writeField(vo.getOsVersion());
        writeField(vo.getDeviceBrand());
        writeField(vo.getEventType());
    }
    
    // Write one field followed by the \u0002 delimiter; null fields become empty.
    private void writeField(String field) throws IOException {
        if (field != null) {
            out.write(field.getBytes(StandardCharsets.UTF_8));
        }
        out.write('\u0002');
    }

    public void afterCreate() throws IOException {
        // no-op: nothing to write when a file is created
    }

    public void afterReopen() throws IOException {
        // no-op: reopening is not supported (see supportsReopen)
    }

    public void flush() throws IOException {
        // no-op: the HDFS sink flushes the underlying stream
    }

    public void beforeClose() throws IOException {
        // no-op: nothing to write before a file is closed
    }

    public boolean supportsReopen() {
        return false;
    }
    
    public static class Builder implements EventSerializer.Builder {
        public EventSerializer build(Context context, OutputStream out) {
            HiveSerializer s = new HiveSerializer(out, context);
            return s;
        }
    }

}
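AppsFlyerLogVo is not shown in the original post. A minimal, hypothetical sketch of its shape (only three of the 31 fields; the @JSONField names mapping the snake_case JSON keys to camelCase bean properties are assumptions):

package com.test.flume.vo;

import com.alibaba.fastjson.annotation.JSONField;

// Hypothetical sketch: one field plus getter/setter per Hive column;
// the remaining 28 fields follow the same pattern.
public class AppsFlyerLogVo {

    @JSONField(name = "device_model")
    private String deviceModel;

    @JSONField(name = "event_name")
    private String eventName;

    @JSONField(name = "event_time")
    private String eventTime;

    public String getDeviceModel() { return deviceModel; }
    public void setDeviceModel(String deviceModel) { this.deviceModel = deviceModel; }

    public String getEventName() { return eventName; }
    public void setEventName(String eventName) { this.eventName = eventName; }

    public String getEventTime() { return eventTime; }
    public void setEventTime(String eventTime) { this.eventTime = eventTime; }

    @Override
    public String toString() {
        return "AppsFlyerLogVo[deviceModel=" + deviceModel
                + ", eventName=" + eventName + ", eventTime=" + eventTime + "]";
    }
}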

Creating the Hive table

DROP TABLE flume_test2;
CREATE EXTERNAL TABLE IF NOT EXISTS flume_test2(
device_model string,
download_time_selected_timezone string,
download_time string,
operator string,
ip string,
app_name string,
city string,
customer_user_id string,
install_time_selected_timezone string,
event_name string,
event_time_selected_timezone string,
is_retargeting string,
install_time string,
event_time string,
platform string,
sdk_version string,
appsflyer_device_id string,
selected_currency string,
wifi string,
advertising_id string,
media_source string,
country_code string,
bundle_id string,
carrier string,
language string,
app_id string,
app_version string,
attribution_type string,
os_version string,
device_brand string,
event_type string
)
PARTITIONED BY (day int)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\u0002'
STORED AS TEXTFILE
LOCATION '/user/flume/testHive/';

After creating the table, the corresponding partitions have to be added:

ALTER TABLE flume_test2 ADD PARTITION (day = 20161122);

This needs to be run by a scheduled job (or something similar) once per day; a sketch follows.
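A sketch of the statement a daily job might run (the 20161123 value is illustrative; the scheduler would substitute the next day's date):

-- IF NOT EXISTS makes re-runs harmless
ALTER TABLE flume_test2 ADD IF NOT EXISTS PARTITION (day = 20161123);

Alternatively, Hive can discover every day=... directory under the table location in one go:

MSCK REPAIR TABLE flume_test2;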


This article drew on http://www.huyanping.cn/flumehive%E5%A4%84%E7%90%86%E6%97%A5%E5%BF%97/

Afterword

  Besides mapping an external table over the sink's output directory, you can also write well-formatted files to HDFS and then load them into Hive with LOAD DATA; a sketch follows.
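  A sketch of that variant (the input path is hypothetical):

-- LOAD DATA INPATH moves the file from its staging directory into the partition's location
LOAD DATA INPATH '/user/flume/staging/log_20161122_00.txt'
INTO TABLE flume_test2 PARTITION (day = 20161122);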

  

  To write into Hive using the Hive Sink instead, see:

  http://stackoverflow.com/questions/30908641/save-flume-output-to-hive-table-with-hive-sink

  http://henning.kropponline.de/2015/05/19/hivesink-for-flume/

  The official Flume user guide also documents the Hive Sink in detail; a minimal config sketch follows.
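  A minimal Hive Sink sketch (the metastore URI, database, table, and field names are assumptions; note that the Hive Sink's streaming ingest requires the target table to be bucketed, stored as ORC, and transactional):

 a1.sinks.k1.type = hive
 a1.sinks.k1.channel = c1
 # hypothetical metastore / database / table
 a1.sinks.k1.hive.metastore = thrift://metastore-host:9083
 a1.sinks.k1.hive.database = default
 a1.sinks.k1.hive.table = flume_test_tx
 a1.sinks.k1.hive.partition = %Y%m%d
 a1.sinks.k1.serializer = DELIMITED
 a1.sinks.k1.serializer.delimiter = "\t"
 a1.sinks.k1.serializer.fieldnames = device_model,event_name,event_time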


PlainJSONHandler

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.test.flume;
import java.io.BufferedReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.servlet.http.HttpServletRequest;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.event.JSONEvent;
import org.apache.flume.source.http.HTTPBadRequestException;
import org.apache.flume.source.http.HTTPSourceHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.gson.JsonParseException;
import com.google.gson.JsonParser;
/**
 * PlainJSONHandler for HTTPSource that accepts a JSON-based HTTP body.
 *
 * The whole request body is forwarded as a single event. Bodies that are
 * not valid JSON are rejected with an HTTPBadRequestException.
 */
public class PlainJSONHandler implements HTTPSourceHandler {

  private static final String FORWARD_HEADERS = "forwardHeaders";
  private static final Logger LOG =
    LoggerFactory.getLogger(PlainJSONHandler.class);
  private static final JsonParser parser = new JsonParser();
  private static final Set<String> forwardHeaders = new HashSet<String>();

  public List<Event> getEvents(HttpServletRequest request) throws Exception {
    // copy the configured forward headers from the request into the event headers
    Map<String, String> eventHeaders = new HashMap<String, String>();
    Enumeration<?> requestHeaders = request.getHeaderNames();
    while (requestHeaders.hasMoreElements()) {
      String header = (String) requestHeaders.nextElement();
      if (forwardHeaders.contains(header)) {
        eventHeaders.put(header, request.getHeader(header));
      }
    }
    // read the whole body; the original read()-in-the-loop-condition version
    // silently dropped one character per line on multi-line bodies
    BufferedReader reader = request.getReader();
    StringBuilder bodyBuffer = new StringBuilder();
    String line;
    while ((line = reader.readLine()) != null) {
      bodyBuffer.append(line);
    }
    List<Event> eventList = new ArrayList<Event>(1);
    if (bodyBuffer.length() > 0) {
      String body = bodyBuffer.toString();
      try {
        // reject bodies that are not valid JSON up front, instead of letting
        // the serializer fail later inside the HDFS sink
        parser.parse(body);
      } catch (JsonParseException ex) {
        throw new HTTPBadRequestException(
          "HTTP body is not a valid JSON object.", ex);
      }
      Event event = new JSONEvent();
      event.setBody(body.getBytes(StandardCharsets.UTF_8));
      event.setHeaders(eventHeaders);
      eventList.add(event);
      LOG.debug("Event body: {}", body);
    }
    return eventList;
  }
  public void configure(Context context) {
    String confForwardHeaders = context.getString(FORWARD_HEADERS);
    if (confForwardHeaders != null) {
      // split the comma-separated list of header names to forward
      forwardHeaders.addAll(Arrays.asList(confForwardHeaders.split(",")));
      LOG.debug("forwardHeaders=" + forwardHeaders);
    } else {
      LOG.debug("no forwardHeaders");
    }
  }
}
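The forwardHeaders value comes from the handler's sub-properties, which the HTTP source passes to configure(); for example (the header names are illustrative):

 a1.sources.r1.handler = com.test.flume.PlainJSONHandler
 a1.sources.r1.handler.forwardHeaders = host,user-agent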


