收集最近 30 分钟之内的 action_id，聚合为一个 list（collect the action_ids within a 30-minute window and aggregate them into a single list）。
<!-- maven配置文件 -->
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>netease.bigdata.course</groupId>
<artifactId>etl</artifactId>
<packaging>pom</packaging>
<version>1.0-SNAPSHOT</version>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.encoding>UTF-8</maven.compiler.encoding>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.6</version>
<scope>provided</scope> <!-- 编译时引用此jar包,但打包的执行程序不会包含 -->
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.4</version>
</dependency>
<dependency>
<groupId>org.anarres.lzo</groupId>
<artifactId>lzo-hadoop</artifactId>
<version>1.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>1.2.2</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-contrib</artifactId>
<version>1.2.2</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
</dependency>
</dependencies>
<build>
<sourceDirectory>src/main/java</sourceDirectory>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>6</source>
<target>6</target>
</configuration>
</plugin>
</plugins>
</build>
</project>
package com.bigdata.etl.udf;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardMapObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class UDAFCollectIn30Minutes extends AbstractGenericUDAFResolver {
@Override
public GenericUDAFEvaluator getEvaluator(TypeInfo[] info) throws SemanticException {
if (info.length != 2) {
throw new UDFArgumentTypeException(info.length - 1, "Exactly two arguments is expected.");
}
if (info[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
throw new UDFArgumentTypeException(0, "Only primitive type arguments are accepted.");
}
if (info[1].getCategory() != ObjectInspector.Category.PRIMITIVE) {
throw new UDFArgumentTypeException(0, "Only primitive type arguments are accepted.");
}
return new CollectActiveNameUDAFEvaluator();
}
public static class CollectActiveNameUDAFEvaluator extends GenericUDAFEvaluator {
protected PrimitiveObjectInspector inputKeyOI; // 输入参数0
protected PrimitiveObjectInspector inputValueOI; // 输入参数1
protected StandardMapObjectInspector internalMergeOI;
@Override
public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
super.init(m, parameters);
if (m == Mode.PARTIAL1) {
inputKeyOI = (PrimitiveObjectInspector) parameters[0];
inputValueOI = (PrimitiveObjectInspector) parameters[1];
return ObjectInspectorFactory.getStandardMapObjectInspector(ObjectInspectorUtils.getStandardObjectInspector(inputKeyOI), ObjectInspectorUtils.getStandardObjectInspector(inputValueOI));
} else if (m == Mode.PARTIAL2) {
internalMergeOI = (StandardMapObjectInspector) parameters[0];
inputKeyOI = (PrimitiveObjectInspector) internalMergeOI.getMapKeyObjectInspector();
inputValueOI = (PrimitiveObjectInspector) internalMergeOI.getMapValueObjectInspector();
return ObjectInspectorUtils.getStandardObjectInspector(internalMergeOI);
} else if (m == Mode.FINAL) {
internalMergeOI = (StandardMapObjectInspector) parameters[0];
inputKeyOI = (PrimitiveObjectInspector) internalMergeOI.getMapKeyObjectInspector();
inputValueOI = (PrimitiveObjectInspector) internalMergeOI.getMapValueObjectInspector();
return ObjectInspectorFactory.getStandardListObjectInspector(inputValueOI);
} else { // COMPLETE阶段 直接输入 timeTag和active_name
inputKeyOI = (PrimitiveObjectInspector) parameters[0];
inputValueOI = (PrimitiveObjectInspector) parameters[1];
return ObjectInspectorFactory.getStandardListObjectInspector(inputValueOI);
}
}
static class activeNameMapTimeAgg extends AbstractAggregationBuffer {
Map<Object, Object> container = Maps.newHashMap();
}
public AbstractAggregationBuffer getNewAggregationBuffer() {
activeNameMapTimeAgg ret = new activeNameMapTimeAgg();
return ret;
}
public void reset(AggregationBuffer agg) {
((activeNameMapTimeAgg) agg).container.clear();
}
public void iterate(AggregationBuffer agg, Object[] parameters) {
assert (parameters.length == 2);
Object key = parameters[0];
Object value = parameters[1];
if (key != null && value != null) {
activeNameMapTimeAgg my_agg = (activeNameMapTimeAgg) agg;
Object kCopy = ObjectInspectorUtils.copyToStandardObject(key, this.inputKeyOI);
Object vCopy = ObjectInspectorUtils.copyToStandardObject(value, this.inputValueOI);
my_agg.container.put(kCopy, vCopy);
}
}
public Object terminatePartial(AggregationBuffer agg) {
activeNameMapTimeAgg my_agg = (activeNameMapTimeAgg) agg;
Map<Object, Object> ret = Maps.newHashMap(my_agg.container);
return ret;
}
public void merge(AggregationBuffer agg, Object partial) {
assert (partial != null);
activeNameMapTimeAgg my_agg = (activeNameMapTimeAgg) agg;
Map<Object, Object> partialResult = (Map<Object, Object>) internalMergeOI.getMap(partial);
for (Map.Entry<Object, Object> entry : partialResult.entrySet()) {
Object kCopy = ObjectInspectorUtils.copyToStandardObject(entry.getKey(), this.inputKeyOI);
Object vCopy = ObjectInspectorUtils.copyToStandardObject(entry.getValue(), this.inputValueOI);
my_agg.container.put(kCopy, vCopy);
}
}
public Object terminate(AggregationBuffer agg) {
activeNameMapTimeAgg my_agg = (activeNameMapTimeAgg) agg;
Map map = new HashMap(my_agg.container.size());
map.putAll(my_agg.container);
List<Map.Entry<LongWritable, Text>> listData = Lists.newArrayList(map.entrySet());
Collections.sort(listData, new Comparator<Map.Entry<LongWritable, Text>>() {
public int compare(Map.Entry<LongWritable, Text> o1, Map.Entry<LongWritable, Text> o2) {
return (o1.getKey().compareTo(o2.getKey()));
}
});
List<Text> result = Lists.newArrayList();
LongWritable currTime = listData.get(listData.size() - 1).getKey();
for (Map.Entry<LongWritable, Text> entry : listData) {
Long timeInterval = (currTime.get() - entry.getKey().get()) / 60000;
if (timeInterval <= 30) {
result.add(entry.getValue());
}
}
return result;
}
}
}