typora-copy-images-to: ./
Flume采集数据
1.案例流程描述:
---->伪造数据实时写入
---->Flume读取并写入HDFS(分区并定义拦截器)
---->将文件转换成Hive表
---->Hive使用UDTF重新处理JSON形式的字段行转列为多段数据.
2.执行流程
1.MockMsg
/**
 * Entry point: starts a daemon-style producer thread that appends one fake
 * tracking event to /var/app/mock/msg.log every 100 ms, forever. The file is
 * tailed by the Flume TAILDIR source; the process is stopped externally.
 */
public static void main(String[] args) {
//开启一个线程专门用于发送消息 — dedicated producer thread
new Thread(() -> {
try {
//世界的尽头 — loop until the process is killed
while (true) {
//获取埋点数据; \r\n terminates the line for the tailing source
String event = EventGenerator.generateEvent() + "\r\n";
//向文件写入数据 (append mode, UTF-8; FileUtils is commons-io)
FileUtils.write(new File("/var/app/mock/msg.log"), event, "utf-8", true);
//休眠0.1秒 — throttle to ~10 events/second
Thread.sleep(100);
}
} catch (IOException e) {
e.printStackTrace();
} catch (InterruptedException e) {
// Restore the interrupt flag so the interruption is observable upstream.
Thread.currentThread().interrupt();
e.printStackTrace();
}
}).start();
}
//--------------------------------------------------------------
/**
 * Generates random tracking ("埋点") events rendered via {@code Map.toString()},
 * e.g. {page=home, action=click, ..., date=2024年01月03日 12时30分45秒, ...}.
 *
 * Uses a LinkedHashMap so field order is deterministic and "date" is never the
 * last entry — the downstream Flume interceptor regex ".*date=(.+?),.*" needs a
 * comma after the date value, which a HashMap's hash-dependent order cannot
 * guarantee.
 */
public class EventGenerator {
private static final String[] PAGES = {"home", "about", "contact", "product", "cart", "checkout"};
private static final String[] ACTIONS = {"click", "view", "add", "remove", "submit"};
private static final String[] DEVICES = {"iPhone", "iPad", "Samsung Galaxy", "Google Pixel", "Huawei", "Xiaomi"};
private static final String[] BROWSERS = {"Chrome", "Safari", "Firefox", "Edge", "Opera"};
private static final String[] LANGUAGES = {"en-US", "en-GB", "fr-FR", "fr-CA", "de-DE", "es-ES", "it-IT", "ja-JP", "ko-KR", "zh-CN", "zh-TW"};
private static final Random random = new Random();
/**
 * 随机生成一条埋点数据 — builds one random event.
 *
 * @return the event in Map.toString() form: {key=value, key=value, ...}
 */
public static String generateEvent() {
// LinkedHashMap: insertion order is the serialization order (see class doc).
Map<String, Object> properties = new LinkedHashMap<>();
properties.put("page", PAGES[random.nextInt(PAGES.length)]);
properties.put("action", ACTIONS[random.nextInt(ACTIONS.length)]);
properties.put("user_id", UUID.randomUUID().toString());
properties.put("product_id", generateProduct());
properties.put("device_type", DEVICES[random.nextInt(DEVICES.length)]);
properties.put("browser", BROWSERS[random.nextInt(BROWSERS.length)]);
properties.put("language", LANGUAGES[random.nextInt(LANGUAGES.length)]);
// Format must stay in sync with the parsers in LogTsInterceptor / LogLogSplitUDTF.
properties.put("date", new SimpleDateFormat("yyyy年MM月dd日 HH时mm分ss秒").format(new Date()));
properties.put("duration", random.nextInt(30) + 1);
properties.put("referrer", generateReferrer());
properties.put("location", generateLocation());
// toString() directly — String.format("%s", map) was a redundant wrapper.
return properties.toString();
}
/** Picks a random product name paired with a random (unrelated) category. */
private static String generateProduct() {
String[] categories = {"Electronics", "Books", "Clothing", "Beauty", "Sports", "Toys", "Home"};
String category = categories[random.nextInt(categories.length)];
String[] products = {"iPhone X", "Samsung Galaxy S10", "iPad Pro", "MacBook Pro", "Kindle", "Harry Potter", "The Lord of the Rings",
"Levi's Jeans", "Nike Air Max", "MAC Lipstick", "Adidas Running Shoes", "Wilson Tennis Racket", "Sony TV", "Dyson Vacuum Cleaner"};
String product = products[random.nextInt(products.length)];
return String.format("%s (%s)", product, category);
}
/** Picks a random referrer site name. */
private static String generateReferrer() {
String[] referrers = {"Google", "Facebook", "Twitter", "LinkedIn", "Instagram", "YouTube", "Amazon"};
return referrers[random.nextInt(referrers.length)];
}
/** Picks a random "city-country" pair (city and country chosen independently). */
private static String generateLocation() {
String[] cities = {"New York", "London", "Paris", "Berlin", "Tokyo", "Sydney", "Toronto", "San Francisco", "Singapore", "Shanghai"};
String city = cities[random.nextInt(cities.length)];
String[] countries = {"United States", "United Kingdom", "France", "Germany", "Japan", "Australia", "Canada", "China", "Singapore"};
String country = countries[random.nextInt(countries.length)];
return String.format("%s-%s", city, country);
}
}
在Artifacts设置主类build此Artifacts后上传至Linux服务器执行,
java -jar mockMsg.jar
2.自定义拦截器
定义后打包上传至Flume的/opt/app/apache-flume-1.11.0-bin/lib/目录下
public class LogTsInterceptor implements Interceptor {
// 声明一个存放事件的List
private List<Event> eventList;
@Override
public void initialize() {
// Allocate the reusable buffer that holds each intercepted batch's results.
this.eventList = new ArrayList<>();
}
/**
 * Extracts the "date" attribute from the event body, parses it, and stores the
 * epoch-millis value in the "timestamp" header so the HDFS sink can resolve
 * time escapes (%Y%m%d/%H%M%S) without useLocalTimeStamp.
 *
 * @param event the Flume event to enrich; returned unchanged on parse failure
 * @return the same event instance, possibly with a "timestamp" header added
 */
@Override
public Event intercept(Event event) {
try {
//获取请求头信息
Map<String, String> headers = event.getHeaders();
//获取请求体信息 — decode as UTF-8 explicitly: the producer writes the log
//file as UTF-8, while new String(byte[]) would use the platform charset.
String body = new String(event.getBody(), java.nio.charset.StandardCharsets.UTF_8);
//根据自己的需求处理数据 — grab the "date=..." value up to the next comma.
// NOTE(review): Pattern and SimpleDateFormat are rebuilt per event; they
// could be hoisted to static final fields for throughput — confirm before changing.
String regex = ".*date=(.+?),.*";
Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
Matcher matcher = pattern.matcher(body);
if (matcher.matches()) {
String dateString = matcher.group(1);
// Format must match what EventGenerator wrote.
Date date = new SimpleDateFormat("yyyy年MM月dd日 HH时mm分ss秒").parse(dateString);
//为Header设置ts
headers.put("timestamp", String.valueOf(date.getTime()));
}
} catch (ParseException e) {
// Best effort: a malformed date leaves the event (and the batch) intact.
e.printStackTrace();
}
//返回处理后的结果
return event;
}
/**
 * Batch variant: runs the single-event interceptor over every event in order.
 *
 * @param list the incoming batch
 * @return the shared result buffer, refilled for this batch
 */
@Override
public List<Event> intercept(List<Event> list) {
// Drop the previous batch's results before reusing the buffer.
eventList.clear();
// Delegate each event to the single-event interceptor.
for (int i = 0; i < list.size(); i++) {
eventList.add(intercept(list.get(i)));
}
return eventList;
}
@Override
public void close() {
// No resources to release; present to satisfy the Interceptor contract.
}
/**
* 通过该静态内部类来创建自定义对象供flume使用,实现Interceptor.Builder接口,并实现其抽象方法
*/
/**
 * Factory Flume instantiates (configured via the interceptor "type" property)
 * to build LogTsInterceptor instances. Implements Interceptor.Builder.
 */
public static class Builder implements Interceptor.Builder {
/**
 * Creates a fresh interceptor instance for Flume to use.
 *
 * @return a new LogTsInterceptor
 */
@Override
public Interceptor build() {
return new LogTsInterceptor();
}
/**
 * Receives interceptor parameters from the Flume configuration.
 * This interceptor takes no parameters, so nothing is read.
 *
 * @param context accessor for the agent's interceptor configuration
 */
@Override
public void configure(Context context) {
}
}
3.Flume采集
在此之前确保hdfs处于正常运行状态
##定义a1的三个组件的名称
a1.sources = r1
a1.sinks = k1
a1.channels = c1
##定义Source的类型
a1.sources.r1.type = TAILDIR
a1.sources.r1.positionFile = /var/app/taildir2hdfs_position.json
a1.sources.r1.filegroups = f1
a1.sources.r1.filegroups.f1 =/var/app/mock/msg.log
##定义Channel的类型
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
##定义Sink的类型
a1.sinks.k1.type=hdfs
a1.sinks.k1.hdfs.path=hdfs://xxx/flume/data/%Y%m%d/%H%M%S
##每隔60s或者文件大小超过100M的时候产生新文件
##hdfs有多少条消息时新建文件,0不基于消息个数
a1.sinks.k1.hdfs.rollCount=0
##hdfs创建多长时间新建文件,0不基于时间
a1.sinks.k1.hdfs.rollInterval=60
##hdfs多大时新建文件,0不基于文件大小
a1.sinks.k1.hdfs.rollSize=104857600
##当目前被打开的临时文件在该参数指定的时间(秒)内,没有任何数据写入,则将该临时文件关闭并重命名成目标文件
a1.sinks.k1.hdfs.idleTimeout=3
a1.sinks.k1.hdfs.fileType=DataStream
a1.sinks.k1.hdfs.useLocalTimeStamp=false
##每x分钟生成一个目录:
##是否启用时间上的”舍弃”,这里的”舍弃”,类似于”四舍五入”,后面再介绍。如果启用,则会影响除了%t的其他所有时间表达式
a1.sinks.k1.hdfs.round=true
##时间上进行“舍弃”的值;
a1.sinks.k1.hdfs.roundValue=2
##时间上进行”舍弃”的单位,包含:second,minute,hour
a1.sinks.k1.hdfs.roundUnit=minute
#定义拦截器,并指定拦截器全类名
a1.sources.r1.interceptors=i6
a1.sources.r1.interceptors.i6.type=com.sherlockl.interceptor.LogTsInterceptor$Builder
##组装source channel sink
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
4.HDFS检验结果
#在验收之前按顺序执行,先开启HDFS,然后运行Flume采集程序,然后运行MockMsg程序
zkServer.sh start
start-all.sh
flume-ng agent -n a1 -c conf -f ${FLUME_HOME}/options/tail2hdfs.conf -Dflume.root.logger=INFO,console
java -jar MockMsg.jar
E(x)期待:
60秒生成一个文件,2分钟生成一个目录,(/年月日/时分秒)每分钟生成一个文件(若超过100M也会生成一个文件),根据拦截器定义的内容只会呈现和正则匹配的数据并且消息头含有数据生成的时间戳.
如图:
5.转存到hive上
create database if not exists ods;
-- ODS layer: one raw event string per row, partitioned by load date.
CREATE TABLE IF NOT EXISTS ods.ods_logevent (
event STRING
)PARTITIONED BY (ymd STRING)
ROW FORMAT DELIMITED LINES TERMINATED BY '\n';
-- Move the day's Flume output into the matching partition.
-- Note: LOAD DATA INPATH moves (not copies) the HDFS files.
LOAD DATA INPATH '/flume/data/20240103/*/*' INTO TABLE ods.ods_logevent partition(ymd='20240103');
-- Spot-check the load.
select * from ods.ods_logevent;
6.数据ODS->DWD切分维度
1.编写UDTF函数
编写后打包上传至HDFS上
public class LogLogSplitUDTF extends GenericUDTF {
/**
* 实例化 UDTF 对象,判断传入参数的长度以及数据类型
*
* @param argOIs
* @return
* @throws UDFArgumentException
*/
@Override
public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
// 获取入参
List<? extends StructField> fieldRefs = argOIs.getAllStructFieldRefs();
// 参数校验,判断传入参数的长度以及数据类型
if (fieldRefs.size() != 1) {
throw new UDFArgumentLengthException("参数个数必须为 1");
}
if (!ObjectInspector.Category.PRIMITIVE.equals(fieldRefs.get(0).getFieldObjectInspector().getCategory())) {
throw new UDFArgumentTypeException(0, "参数类型必须为 String");
}
// 自定义函数输出的字段和类型
// 创建输出字段名称的集合
ArrayList<String> columnNames = new ArrayList<>();
// 创建字段数据类型的集合
ArrayList<ObjectInspector> columnType = new ArrayList<>();
columnNames.add("user_id");
columnType.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
columnNames.add("product_id");
columnType.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
columnNames.add("duration");
columnType.add(PrimitiveObjectInspectorFactory.javaIntObjectInspector);
columnNames.add("page");
columnType.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
columnNames.add("action");
columnType.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
columnNames.add("device_type");
columnType.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
columnNames.add("browser");
columnType.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
columnNames.add("language");
columnType.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
columnNames.add("referrer");
columnType.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
columnNames.add("location");
columnType.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
columnNames.add("ts");
columnType.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
return ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, columnType);
}
/**
* 处理数据
*
* @param args
* @throws HiveException
*/
@Override
public void process(Object[] args) throws HiveException {
try {
//存储要输出的列的值
Object[] outColumnValue = new Object[11];
//定义一个Map存储拆分后的数据
Map<String, String> map = new HashMap<>();
if (args[0] != null) {
//替换大括号,并分割属性
String[] attrs = args[0].toString().replaceAll("\\{|\\}", "").split(",");
//遍历
for (String attr : attrs) {
String[] kv = attr.split("=");
map.put(kv[0].trim(), kv[1].trim());
}
//开始拼接数据
outColumnValue[0] = map.get("user_id");
outColumnValue[1] = map.get("product_id");
outColumnValue[2] = Integer.parseInt(map.get("duration"));
outColumnValue[3] = map.get("page");
outColumnValue[4] = map.get("action");
outColumnValue[5] = map.get("device_type");
outColumnValue[6] = map.get("browser");
outColumnValue[7] = map.get("language");
outColumnValue[8] = map.get("referrer");
outColumnValue[9] = map.get("location");
outColumnValue[10] = String.valueOf(new SimpleDateFormat("yyyy年MM月dd日 HH时mm分ss秒").parse(map.get("date")).getTime());
//将值进行输出
forward(outColumnValue);
} else {
//传入的数据格式有误
System.out.println("LogLogSplitUDTF.process【" + args + "】");
}
} catch (ParseException e) {
e.printStackTrace();
}
}
@Override
public void close() throws HiveException {
}
}
2.在hive上创建函数
CREATE FUNCTION MY_UDTF AS 'com.sherlock.hive.LogLogSplitUDTF'
USING JAR 'hdfs:///jar/dw_01mockdata-1.0-SNAPSHOT.jar';
-- 查看函数详细信息
DESC FUNCTION EXTENDED MY_UDTF;
3.建立DWD表
按日期分区
-- DWD layer: one typed column per event attribute.
-- Column order MUST match MY_UDTF's output order (user_id, product_id,
-- duration, page, action, device_type, browser, language, referrer,
-- location, ts): the INSERT ... SELECT maps columns positionally, so the
-- previous ordering put values into the wrong columns.
CREATE TABLE IF NOT EXISTS dwd.dwd_event(
userid STRING,
product_id STRING,
duration INT,       -- matches the UDTF's javaIntObjectInspector
page STRING,
action STRING,
device_type STRING,
browser STRING,
language STRING,
referrer STRING,
location STRING,
ts BIGINT           -- epoch millis; UDTF emits a numeric string, Hive casts
)partitioned by (ymd STRING)
row format delimited fields terminated by ',' lines terminated by "\n";
4.执行插入操作并验收结果
insert into dwd.dwd_event partition (ymd='20240103') select MY_UDTF(event) from ods.ods_logevent;
select * from dwd.dwd_event;