The example below comes from my own experiments with writing a Hive UDAF. I hit quite a few pitfalls along the way, so I'm writing them down here as notes.
UDAF class implementation:
package com.test.newsevent;

import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.hive.ql.exec.UDAF;
import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;

@SuppressWarnings("deprecation")
public class GetNewsTitleInfoStatistics extends UDAF {

    // Holds the aggregated statistics for one news title
    public static class NewsTitleInfo implements Serializable {
        private String title;      // news title
        private String url;        // article URL
        private String isCredible; // whether the source is credible: "yes" / "no"
        private int total;         // number of occurrences of this title

        public NewsTitleInfo() {
        }

        public NewsTitleInfo(String title, int total, String isCredible, String url) {
            super();
            this.title = title;
            this.total = total;
            this.isCredible = isCredible;
            this.url = url;
        }

        public String getTitle() {
            return title;
        }

        public void setTitle(String title) {
            this.title = title;
        }

        public String getUrl() {
            return url;
        }

        public void setUrl(String url) {
            this.url = url;
        }

        public String getIsCredible() {
            return isCredible;
        }

        public void setIsCredible(String isCredible) {
            this.isCredible = isCredible;
        }

        public int getTotal() {
            return total;
        }

        public void setTotal(int total) {
            this.total = total;
        }

        @Override
        public String toString() {
            return title + "#@#@" + total + "#@#@" + isCredible + "#@#@" + url;
        }
    }
    public static class Evaluator implements UDAFEvaluator {
        private Map<String, NewsTitleInfo> titleStatisticInfoMap;

        public Evaluator() {
            super();
            init();
        }

        // Initialize the aggregation state; the constructor must call this
        public void init() {
            titleStatisticInfoMap = new HashMap<String, NewsTitleInfo>();
        }

        // Map phase: called once for every input row
        public boolean iterate(String title, String author, String url) {
            if (title == null || author == null || url == null) {
                return true;
            }
            // News title: trim and unescape HTML entities left in crawled titles
            title = title.trim();
            title = title.replaceAll("&middot;", "·");
            title = title.replaceAll("&quot;", "\"");
            if (title.equals("")) {
                return true;
            }
            // News source (author)
            author = author.trim();
            // Check whether the article comes from a whitelisted site
            String tempIsCredible = "no";
            /*if (author.contains("新浪") || author.contains("搜狐")) {
                tempIsCredible = "yes";
            }*/
            if (author.contains("时光网")) {
                tempIsCredible = "yes";
            }
            if (tempIsCredible.equals("no")) {
                return true;
            }
            // News URL
            url = url.trim();
            if (titleStatisticInfoMap.containsKey(title)) {
                NewsTitleInfo nti = titleStatisticInfoMap.get(title);
                int total = nti.getTotal() + 1;
                nti.setTotal(total);
                if (tempIsCredible.equals("yes")) {
                    if (nti.getIsCredible().equals("no")) {
                        nti.setIsCredible(tempIsCredible);
                    }
                }
                titleStatisticInfoMap.put(title, nti);
            } else {
                NewsTitleInfo nti = new NewsTitleInfo();
                nti.setTitle(title);
                nti.setUrl(url);
                nti.setTotal(1);
                nti.setIsCredible(tempIsCredible);
                titleStatisticInfoMap.put(title, nti);
            }
            return true;
        }

        // Emit the partial aggregation state at the end of the map phase
        public Map<String, NewsTitleInfo> terminatePartial() {
            return titleStatisticInfoMap;
        }

        // Reduce phase: receives the map returned by terminatePartial
        public boolean merge(Map<String, NewsTitleInfo> other) {
            // Skip null partials
            if (other == null) {
                return true;
            }
            for (String key : other.keySet()) {
                String title = key;
                // NewsTitleInfo nti = other.get(title); -- this makes keys and
                // values get out of sync: nti gets overwritten by another entry,
                // so a fresh object must be created and the fields copied over
                NewsTitleInfo nti = new NewsTitleInfo();
                nti.setTitle(other.get(title).getTitle());
                nti.setTotal(other.get(title).getTotal());
                nti.setUrl(other.get(title).getUrl());
                nti.setIsCredible(other.get(title).getIsCredible());
                if (titleStatisticInfoMap.containsKey(title)) {
                    NewsTitleInfo temp_nti = titleStatisticInfoMap.get(title);
                    int total = temp_nti.getTotal() + nti.getTotal();
                    temp_nti.setTotal(total);
                    titleStatisticInfoMap.put(title, temp_nti);
                } else {
                    titleStatisticInfoMap.put(title, nti);
                }
            }
            return true;
        }

        // Produce the final result string
        public String terminate() {
            StringBuilder sb = new StringBuilder();
            for (String title : titleStatisticInfoMap.keySet()) {
                //sb.append("title:" + title);
                //sb.append("\t");
                sb.append(titleStatisticInfoMap.get(title).toString());
                sb.append("\n");
            }
            return sb.toString();
        }
    }
}
Hive SQL:
CLASSIFIER_JAR="/test/Jar_GetDataFromHive/getdatafromhive-0.0.1-SNAPSHOT.jar"
hive -e "add jars $CLASSIFIER_JAR;
create temporary function titlestatistic as 'com.test.newsevent.GetNewsTitleInfoStatistics';
set mapred.reduce.tasks=100;
set hive.map.aggr=true;
set mapred.job.priority=NORMAL;
use dmm;
select t.titleinfo
from
(
  select id, titlestatistic(title, author, url) as titleinfo
  from dmm.es_portals_web_data
  where datediff('2016-09-29', substr(publish_time_string, 0, 10)) < 7
    and datediff('2016-09-29', substr(publish_time_string, 0, 10)) >= 0
  group by id
) t
where t.titleinfo is not NULL and length(t.titleinfo) > 0;" > ./timeresult_temp.txt
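For reference, each row written to ./timeresult_temp.txt carries the full aggregate for one id: the titles are newline-separated inside a single string, each in the NewsTitleInfo.toString() format title#@#@total#@#@isCredible#@#@url. A made-up example line (all values hypothetical):

电影A首映#@#@3#@#@yes#@#@http://example.com/a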
Notes on the pitfalls:
1. init() must initialize every member variable that will be used later, and the constructor must call init();
2. In merge(), writing NewsTitleInfo nti = other.get(title); makes keys and values get out of sync: nti ends up overwritten by another entry. A fresh object must be created instead, NewsTitleInfo nti = new NewsTitleInfo();, with the fields copied over. I am not entirely sure of the underlying cause; it is most likely Hadoop's object reuse during deserialization, where the same instance is rewritten for each entry, so any reference you store ends up aliasing a single object.
3. The class GetNewsTitleInfoStatistics is essentially a user-defined aggregate function, so the SQL that calls it must use group by.
4. This example groups by id, producing one result row per id. (One id corresponds to several title statistics, but they are emitted as a single row.)
5. titleStatisticInfoMap records every row during iterate, yet it is used again during merge. My understanding is that these two phases run in the mapper and the reducer respectively; init() is called again in each phase to re-initialize titleStatisticInfoMap, so it is not the same variable at all, the two phases merely share a name. The sketch below simulates exactly this lifecycle.
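To make notes 1, 2 and 5 concrete, here is a minimal sketch that drives the Evaluator by hand in plain Java, roughly the way Hive does across the map and reduce phases. Everything in it is invented for illustration (the demo class name, sample rows and URLs); it only needs the hive-exec jar on the classpath to compile:

package com.test.newsevent;

import java.util.Map;

// Drives GetNewsTitleInfoStatistics.Evaluator by hand, mimicking Hive:
// two map-side instances consume rows, a reduce-side instance merges them.
public class EvaluatorLifecycleDemo {
    public static void main(String[] args) {
        // Map phase: each mapper gets its own Evaluator; the constructor
        // calls init(), which creates a fresh titleStatisticInfoMap (note 1)
        GetNewsTitleInfoStatistics.Evaluator mapper1 = new GetNewsTitleInfoStatistics.Evaluator();
        mapper1.iterate("电影A首映", "时光网", "http://example.com/a");
        mapper1.iterate("电影A首映", "时光网", "http://example.com/a2");

        GetNewsTitleInfoStatistics.Evaluator mapper2 = new GetNewsTitleInfoStatistics.Evaluator();
        mapper2.iterate("电影A首映", "时光网", "http://example.com/a3");
        mapper2.iterate("来自不可信来源的新闻", "某站点", "http://example.com/x"); // filtered out

        // Each mapper hands over its partial aggregation state
        Map<String, GetNewsTitleInfoStatistics.NewsTitleInfo> partial1 = mapper1.terminatePartial();
        Map<String, GetNewsTitleInfoStatistics.NewsTitleInfo> partial2 = mapper2.terminatePartial();

        // Reduce phase: a brand-new Evaluator, so its titleStatisticInfoMap
        // is a different object from the mappers' (note 5)
        GetNewsTitleInfoStatistics.Evaluator reducer = new GetNewsTitleInfoStatistics.Evaluator();
        reducer.merge(partial1); // merge() copies each value into a new object (note 2)
        reducer.merge(partial2);

        // Prints: 电影A首映#@#@3#@#@yes#@#@http://example.com/a
        System.out.print(reducer.terminate());
    }
}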
References:
http://blog.csdn.net/duan19056/article/details/18002295
http://p-x1984.iteye.com/blog/1156392
http://blog.csdn.net/wisgood/article/details/26167367