Bigdata: Pitfalls Hit in Practice
Hive table creation
CREATE EXTERNAL TABLE IF NOT EXISTS `ods_na_expense` (
`id` bigint,
`user_id` bigint,
`price` bigint COMMENT 'reading coins spent',
`created_at` string,
`book_id` bigint COMMENT 'book id',
`chapte_id` bigint COMMENT 'chapter id',
`updated_at` string
)
COMMENT 'consumption records'
PARTITIONED BY (dt STRING) ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\001"
STORED AS
INPUTFORMAT "com.hadoop.mapred.DeprecatedLzoTextInputFormat"
OUTPUTFORMAT "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"
LOCATION '/warehouse/youshu/ods/ods_db/na_expense';
EXTERNAL is the keyword for creating an external table. An external table's data lives on HDFS and is managed there: dropping the table removes only its metadata, while the files on HDFS are left untouched. That is exactly why we create external tables here.
In this CREATE statement the INPUTFORMAT is the LZO text input format, and the OUTPUTFORMAT is the plain key/value text output format (HiveIgnoreKeyTextOutputFormat).
Reads go through the INPUTFORMAT; writes go through the OUTPUTFORMAT.
The fields are separated by the special character "\001", so when loading data remember to check whether the raw data itself contains this character.
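As a quick sanity check on the delimiter (a minimal sketch, not from the original post; the sample row and values below are made up), you can split one line on '\001' in Java and count the fields:
public class DelimiterCheck {
    public static void main(String[] args) {
        // Hypothetical sample row for the ods_na_expense layout: 7 fields joined by '\001'
        String line = String.join("\u0001",
                "1", "1001", "300", "2021-06-04 16:06:00", "42", "7", "2021-06-04 16:06:00");
        // -1 keeps trailing empty fields instead of dropping them
        String[] fields = line.split("\u0001", -1);
        System.out.println("field count = " + fields.length);   // expect 7
    }
}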
Collecting tracking logs with Flume
flume2kafka
Why do we first collect with Flume into Kafka, and only then use a second Flume agent to move the data from Kafka to HDFS? It protects the data: Kafka buffers the logs, so they are not lost if the HDFS side is slow or unavailable.
flume2kafka.conf
a1.sources = s1
a1.channels = c1
# source configuration
a1.sources.s1.type = TAILDIR
a1.sources.s1.channels = c1
a1.sources.s1.filegroups = f1
a1.sources.s1.filegroups.f1 = /opt/module/applogs/log/.*
a1.sources.s1.fileHeader = true
a1.sources.s1.positionFile = /opt/module/flume-1.9.0/log/taildir_position.json
# channel configuration
a1.channels.c1.type = org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c1.kafka.bootstrap.servers = test-cluster101:9092,test-cluster102:9092,test-cluster103:9092
a1.channels.c1.kafka.topic = recommender_systems_log
a1.channels.c1.parseAsFlumeEvent = false
# producer configuration
a1.channels.c1.kafka.producer.acks = 1
a1.channels.c1.kafka.producer.batch.size = 100000
a1.channels.c1.kafka.producer.linger.ms = 5000
# interceptor configuration
a1.sources.s1.interceptors = i1
a1.sources.s1.interceptors.i1.type = com.youshu.LogETLInterceptor$Builder
Our source is TAILDIR, which can monitor multiple files under a directory and tail new content as it is appended.
Our channel is the KafkaChannel, which lets us drop the sink entirely at this stage.
In the Flume-to-Kafka stage we added a custom interceptor. Its job is to check whether each log record is valid JSON, and then to check the value of "start_time" (only events from today or yesterday are kept).
The code is as follows:
package com.youshu;

import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;

import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.Iterator;
import java.util.List;

public class LogETLInterceptor implements Interceptor {

    @Override
    public void initialize() {
    }

    /**
     * Process a single event: keep it only if the body is valid JSON
     * and its "start_time" falls on today or yesterday.
     * @author 杜志伟
     * @Date 2021/6/4 16:06
     * @param event
     * @return the original event, or null to drop it
     */
    @Override
    public Event intercept(Event event) {
        // get the event body
        String json = new String(event.getBody());
        // today's date, formatted as yyyy-MM-dd (e.g. 2020-11-11)
        Date date = new Date();
        SimpleDateFormat dt = new SimpleDateFormat("yyyy-MM-dd");
        String today = dt.format(date);
        // yesterday's date
        Calendar cal = Calendar.getInstance();
        cal.add(Calendar.DAY_OF_MONTH, -1);
        Date time = cal.getTime();
        String yesterday = dt.format(time);
        try {
            JsonObject obj = new JsonParser().parse(json).getAsJsonObject();
            if (obj != null) {
                String s = obj.get("start_time").getAsString();
                String[] s1 = s.split(" ");
                String start_time = s1[0];
                if (start_time.equals(today) || start_time.equals(yesterday)) {
                    return event;
                } else {
                    return null;
                }
            } else {
                return null;
            }
        } catch (Exception e) {
            // not valid JSON (or missing start_time): drop the event
            return null;
        }
    }

    /**
     * Process a batch of events: apply the single-event logic
     * and remove every event that was dropped.
     * @author 杜志伟
     * @Date 2021/6/4 16:06
     * @param list
     * @return
     */
    @Override
    public List<Event> intercept(List<Event> list) {
        final Iterator<Event> it = list.iterator();
        while (it.hasNext()) {
            Event event = intercept(it.next());
            if (event == null) it.remove();
        }
        return list;
    }

    @Override
    public void close() {
    }

    public static class Builder implements Interceptor.Builder {
        @Override
        public Interceptor build() {
            return new LogETLInterceptor();
        }

        @Override
        public void configure(Context context) {
        }
    }
}
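If you want to sanity-check this filter outside of Flume (a minimal sketch, not part of the original post; the class name and sample JSON bodies are made up), you can feed single events through the interceptor directly:
import org.apache.flume.Event;
import org.apache.flume.event.EventBuilder;
import org.apache.flume.interceptor.Interceptor;
import java.nio.charset.StandardCharsets;

// hypothetical local check, not part of the pipeline
public class InterceptorSmokeTest {
    public static void main(String[] args) {
        Interceptor interceptor = new com.youshu.LogETLInterceptor.Builder().build();
        // start_time is clearly not today or yesterday, so this event should be dropped
        Event stale = EventBuilder.withBody(
                "{\"start_time\":\"2000-01-01 00:00:00\"}", StandardCharsets.UTF_8);
        System.out.println(interceptor.intercept(stale) == null ? "dropped" : "kept");
        // a non-JSON line should also be dropped
        Event broken = EventBuilder.withBody("not json at all", StandardCharsets.UTF_8);
        System.out.println(interceptor.intercept(broken) == null ? "dropped" : "kept");
    }
}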
Package the finished code as a jar and put it under flume/lib. If you use Alibaba's fastjson, remember to copy the fastjson jar into flume/lib as well, otherwise an error is thrown.
The interceptor type configured in flume2kafka.conf must use exactly the same class name as in the code, otherwise the log will complain that $Builder cannot be found.
flume2hdfs
flume2hdfs.conf
# agent components
a1.sources = kafkaSource
a1.channels = memoryChannel
a1.sinks = hdfsSink
# bindings
a1.sources.kafkaSource.channels = memoryChannel
a1.sinks.hdfsSink.channel = memoryChannel
# source configuration
a1.sources.kafkaSource.type = org.apache.flume.source.kafka.KafkaSource
a1.sources.kafkaSource.batchSize = 5000
a1.sources.kafkaSource.batchDurationMillis = 2000
a1.sources.kafkaSource.kafka.bootstrap.servers = test-cluster101:9092,test-cluster102:9092
a1.sources.kafkaSource.topic = recommender_systems_log
a1.sources.kafkaSource.kafka.consumer = flume2hdfs
a1.sources.kafkaSource.kafka.consumer.timeout.ms = 100
# channel configuration
a1.channels.memoryChannel.type = memory
a1.channels.memoryChannel.capacity = 10000
a1.channels.memoryChannel.transactionCapacity = 1000
# sink configuration
a1.sinks.hdfsSink.type = hdfs
a1.sinks.hdfsSink.hdfs.codeC = lzop
a1.sinks.hdfsSink.hdfs.path = /origin_data/youshuge/log/recommender_systems_log/%{start_time}/
a1.sinks.hdfsSink.hdfs.writeFormat = Text
a1.sinks.hdfsSink.hdfs.fileType = CompressedStream
# replication strategy
a1.sinks.hdfsSink.hdfs.minBlockReplicas = 1
a1.sinks.hdfsSink.hdfs.rollSize = 134000000
a1.sinks.hdfsSink.hdfs.rollCount = 0
a1.sinks.hdfsSink.hdfs.rollInterval = 3600
a1.sinks.hdfsSink.hdfs.filePrefix = ysg
a1.sinks.hdfsSink.hdfs.fileSuffix =
a1.sinks.hdfsSink.hdfs.inUsePrefix = _
a1.sinks.hdfsSink.hdfs.inUseSuffix =
# interceptor configuration
a1.sources.kafkaSource.interceptors = i1
a1.sources.kafkaSource.interceptors.i1.type = com.youshuge.logETLInterceptor$Builder
a1.sources.kafkaSource.kafka.consumer = flume2hdfs — this line names the Kafka consumer flume2hdfs; you also have to configure the consumer group.id in consumer.properties under Kafka's conf directory, otherwise an error is reported.
a1.sinks.hdfsSink.hdfs.path = /origin_data/youshuge/log/recommender_systems_log/%{start_time}/ — this setting partitions the output path by the start_time event header rather than by the write time, which is what guards against the day-boundary problem.
To populate that header we wrote another custom Flume interceptor.
The code is as follows:
package com.youshuge;

import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;

import java.util.*;

public class logETLInterceptor implements Interceptor {

    @Override
    public void initialize() {
    }

    @Override
    public Event intercept(Event event) {
        // get the event body
        String json = new String(event.getBody());
        Map<String, String> headers = event.getHeaders();
        // parse the body as JSON and copy the date part of start_time into the header
        try {
            JsonObject obj = new JsonParser().parse(json).getAsJsonObject();
            String start_time1 = obj.get("start_time").getAsString();
            String[] s = start_time1.split(" ");
            String start_time = s[0];
            headers.put("start_time", start_time);
            event.setHeaders(headers);
            return event;
        } catch (Exception e) {
            // not valid JSON (or missing start_time): drop the event
            return null;
        }
    }

    @Override
    public List<Event> intercept(List<Event> events) {
        // list to collect the events that survive interception
        ArrayList<Event> list = new ArrayList<>();
        // loop over the batch and apply the single-event logic
        for (Event event : events) {
            Event newEvent = intercept(event);
            if (newEvent != null) {
                list.add(newEvent);
            }
        }
        return list;
    }

    @Override
    public void close() {
    }

    public static class Builder implements Interceptor.Builder {
        @Override
        public Interceptor build() {
            return new logETLInterceptor();
        }

        @Override
        public void configure(Context context) {
        }
    }
}
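A similar minimal local check (again not from the original post; the class name and timestamp are invented) shows the header value that the HDFS sink later substitutes into %{start_time}:
import org.apache.flume.Event;
import org.apache.flume.event.EventBuilder;
import org.apache.flume.interceptor.Interceptor;
import java.nio.charset.StandardCharsets;

// hypothetical local check, not part of the pipeline
public class HeaderSmokeTest {
    public static void main(String[] args) {
        Interceptor interceptor = new com.youshuge.logETLInterceptor.Builder().build();
        // an event generated just before midnight on 2021-06-04
        Event e = EventBuilder.withBody(
                "{\"start_time\":\"2021-06-04 23:59:59\"}", StandardCharsets.UTF_8);
        Event out = interceptor.intercept(e);
        // prints 2021-06-04; the HDFS sink expands %{start_time} to this value,
        // so the file lands in .../recommender_systems_log/2021-06-04/ even if it
        // is actually written after midnight
        System.out.println(out.getHeaders().get("start_time"));
    }
}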
The day-boundary problem: data generated at 23:59:59 may only reach HDFS through Flume after midnight, in which case it would otherwise be filed under the next day's directory. The configuration and custom interceptor above (partitioning the path by the start_time header) solve this.
All of this was typed by hand and reflects my own understanding; discussion is welcome.
Please let me know before reposting.