UDAF Example

The example below comes from my own experiments with writing a UDAF. I hit quite a few pitfalls along the way, so I'm writing them down here as notes.

UDAF class implementation

package com.test.newsevent;

import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.hive.ql.exec.UDAF;
import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;

@SuppressWarnings("deprecation")
public class GetNewsTitleInfoStatistics extends UDAF {

	// Holds the statistics for one news title
	public static class NewsTitleInfo implements Serializable {
		private String title;      // title text
		private String url;        // URL of the news item
		private String isCredible; // whether the news source is credible: "no" = not credible, "yes" = credible
		private int total;         // number of occurrences of this news item
		
		public NewsTitleInfo() {
			
		}
		
		public NewsTitleInfo(String title, int total, String isCredible, String url) {
			super();
			this.title = title;
			this.total = total;
			this.isCredible = isCredible;
			this.url = url;
		}
		
		public String getTitle() {
			return title;
		}

		public void setTitle(String title) {
			this.title = title;
		}

		public String getUrl() {
			return url;
		}

		public void setUrl(String url) {
			this.url = url;
		}

		public String getIsCredible() {
			return isCredible;
		}

		public void setIsCredible(String isCredible) {
			this.isCredible = isCredible;
		}

		public int getTotal() {
			return total;
		}

		public void setTotal(int total) {
			this.total = total;
		}

		public String toString(){
			return title + "#@#@" + total + "#@#@" + isCredible + "#@#@" + url;
		}
	}
	
	public static class Evaluator implements UDAFEvaluator{
		private Map<String, NewsTitleInfo> titleStatisticInfoMap;
		
		public Evaluator(){
			super();
			init();
		}
		
		// Initialization; every variable used later must be set up here
		public void init() {
			titleStatisticInfoMap = new HashMap<String, NewsTitleInfo>();
		}
		
		// Map phase: called once for each input record
		public boolean iterate(String title, String author, String url){
			if(title == null || author == null || url == null){
				return true;
			}
		
			// News title
			title = title.trim();
			// Normalize HTML entities left over in crawled titles
			title = title.replaceAll("&middot;", "·");
			title = title.replaceAll("&quot;", "\"");
			if(title.equals("")){
				return true;
			}
			
			// News source (author)
			author = author.trim();
			
			// Check whether the news comes from a designated (trusted) site
			String tempIsCredible = "no";
			/*if(author.contains("新浪") || author.contains("搜狐")){
				tempIsCredible = "yes";
			}*/
			
			if(author.contains("时光网")){
				tempIsCredible = "yes";
			}
			
			if(tempIsCredible.equals("no")){
				return true;
			}
			
			// News URL
			url = url.trim();
			
			if(titleStatisticInfoMap.containsKey(title)){
				NewsTitleInfo nti = titleStatisticInfoMap.get(title);
				int total = nti.getTotal() + 1;
				nti.setTotal(total);
				if(tempIsCredible.equals("yes")){
					if(nti.getIsCredible().equals("no")){
						nti.setIsCredible(tempIsCredible);
					}
				}
				titleStatisticInfoMap.put(title, nti);
			}else{
				NewsTitleInfo nti = new NewsTitleInfo();
				nti.setTitle(title);
				nti.setUrl(url);
				nti.setTotal(1);
				nti.setIsCredible(tempIsCredible);
				titleStatisticInfoMap.put(title, nti);
			}
			
			return true;
		}
		
		public Map<String, NewsTitleInfo> terminatePartial() {
			return titleStatisticInfoMap;
		}
	
		// Reduce phase
		// Receives the partial maps returned by terminatePartial()
		public boolean merge(Map<String, NewsTitleInfo> other) {
			// Nothing to do if the partial result is null
			if(other == null){
				return true;
			}
			
			for(String key : other.keySet()){
				String title = key;
				// NewsTitleInfo nti = other.get(title);
				// Reusing the value object straight from the partial map causes keys and values
				// to get mismatched: nti ends up overwritten by an earlier entry,
				// so a fresh object has to be created and the fields copied over instead.
				NewsTitleInfo nti = new NewsTitleInfo();
				nti.setTitle(other.get(title).getTitle());
				nti.setTotal(other.get(title).getTotal());
				nti.setUrl(other.get(title).getUrl());
				nti.setIsCredible(other.get(title).getIsCredible());
				
				if(titleStatisticInfoMap.containsKey(title)){
					NewsTitleInfo temp_nti = titleStatisticInfoMap.get(title);
					int total = temp_nti.getTotal() + nti.getTotal();
					temp_nti.setTotal(total);
					titleStatisticInfoMap.put(title, temp_nti);
				}else{
					titleStatisticInfoMap.put(title, nti);
				}
			}
			
			return true;
		}
		
		// Returns the final result
		public String terminate(){
			StringBuilder sb = new StringBuilder();
			for(String title : titleStatisticInfoMap.keySet()){
				//sb.append("title:" + title);
				//sb.append("\t");
				sb.append(titleStatisticInfoMap.get(title).toString());
				sb.append("\n");
			}
			
			return sb.toString();
		}
		
	}

}

Hive SQL:

CLASSIFIER_JAR="/test/Jar_GetDataFromHive/getdatafromhive-0.0.1-SNAPSHOT.jar"
hive -e "add jars $CLASSIFIER_JAR;
create temporary function titlestatistic as 'com.test.newsevent.GetNewsTitleInfoStatistics';
set mapred.reduce.tasks=100;
set hive.map.aggr=true;
set mapred.job.priority=NORMAL;
use dmm;
select t.titleinfo
from 
(
select id, titlestatistic(title, author, url) as titleinfo
from 
dmm.es_portals_web_data where datediff('2016-09-29', substr(publish_time_string, 0, 10)) < 7 and datediff('2016-09-29', substr(publish_time_string, 0, 10)) >= 0 group by id
) t where t.titleinfo is not NULL and length(t.titleinfo) > 0;" > ./timeresult_temp.txt
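
Each record that terminate() writes into ./timeresult_temp.txt is one line of the form title#@#@total#@#@isCredible#@#@url, matching NewsTitleInfo.toString(). As a minimal post-processing sketch (the class name ReadTitleStatistics and the printed format below are just illustrative, not part of the UDAF), the file can be parsed back like this:

package com.test.newsevent;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

public class ReadTitleStatistics {
	public static void main(String[] args) throws IOException {
		// Read the file produced by the redirected hive -e call above
		try (BufferedReader br = new BufferedReader(new FileReader("./timeresult_temp.txt"))) {
			String line;
			while ((line = br.readLine()) != null) {
				line = line.trim();
				if (line.isEmpty()) {
					continue;
				}
				// "#@#@" is the separator used by NewsTitleInfo.toString()
				String[] fields = line.split("#@#@");
				if (fields.length == 4) {
					System.out.println("title=" + fields[0] + ", total=" + fields[1]
							+ ", credible=" + fields[2] + ", url=" + fields[3]);
				}
			}
		}
	}
}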


1. init() must initialize every member variable used later, and the constructor must call init().

2. In merge(): writing NewsTitleInfo nti = other.get(title); leads to keys and values getting mismatched, because nti gets overwritten by an earlier entry. You must create a fresh object instead: NewsTitleInfo nti = new NewsTitleInfo();. I'm not entirely sure of the underlying reason; it is probably caused by how Hadoop reuses objects.

3. GetNewsTitleInfoStatistics is essentially a custom aggregate function, so the SQL that calls it must use GROUP BY.

4. This example groups by id, so each id produces exactly one result row (one id corresponds to statistics for multiple titles, but they are all emitted as a single row).

5. titleStatisticInfoMap records every input row during iterate() and is used again during merge(). My understanding is that these two phases run in the mapper and the reducer respectively; init() is called again in each phase to initialize titleStatisticInfoMap, so the map-side and reduce-side variables are not the same object, they only share a name (see the sketch after this list).
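
To make note 5 concrete, here is a minimal sketch that drives the Evaluator by hand outside Hive, simulating two map-side instances and one reduce-side instance. The class name EvaluatorSmokeTest and the sample rows are made up for illustration; only the "时光网" source passes the credibility filter in iterate().

package com.test.newsevent;

public class EvaluatorSmokeTest {
	public static void main(String[] args) {
		// Two map-side evaluators; each constructor calls init() and gets its own map
		GetNewsTitleInfoStatistics.Evaluator mapSide1 = new GetNewsTitleInfoStatistics.Evaluator();
		GetNewsTitleInfoStatistics.Evaluator mapSide2 = new GetNewsTitleInfoStatistics.Evaluator();

		// Made-up sample rows (hypothetical title and URLs)
		mapSide1.iterate("sample news title", "时光网", "http://example.com/a");
		mapSide1.iterate("sample news title", "时光网", "http://example.com/a");
		mapSide2.iterate("sample news title", "时光网", "http://example.com/b");

		// Reduce-side evaluator: a separate instance with its own init(), merging the partial maps
		GetNewsTitleInfoStatistics.Evaluator reduceSide = new GetNewsTitleInfoStatistics.Evaluator();
		reduceSide.merge(mapSide1.terminatePartial());
		reduceSide.merge(mapSide2.terminatePartial());

		// Prints one line such as: sample news title#@#@3#@#@yes#@#@http://example.com/a
		System.out.println(reduceSide.terminate());
	}
}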


References:

http://blog.csdn.net/duan19056/article/details/18002295

http://p-x1984.iteye.com/blog/1156392

http://blog.csdn.net/wisgood/article/details/26167367
