Topic: use a UDF to count the number of visits to each website in the flow.dat log file

Part 1: Write a MapReduce program to clean the data

  We want to count the number of visits to each website in the log file. For convenience, we extract only the website column; this extraction is done in the map phase, and the reduce phase does not need to do any further processing of the data.

Source code:

The MyMapper class

package com.WebsiteCount;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MyMapper extends Mapper<LongWritable, Text, LongWritable, Text> {

	@Override
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
		// Each input line is tab-separated; the website column is field index 12.
		String data = value.toString();
		String[] splitedData = data.split("\t");
		Text outValue = new Text(splitedData[12]);
		// Keep the byte offset as the key and emit only the website column as the value.
		context.write(key, outValue);
		System.out.println("Mapper output <" + key.toString() + "," + outValue.toString() + ">");
	}

}

The MyReduce class

package com.WebsiteCount;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class MyReduce extends Reducer<LongWritable, Text, Text, NullWritable> {

	@Override
	protected void reduce(LongWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
		// No aggregation here: simply write each website value out as its own line.
		for (Text value : values) {
			System.out.println("Reduce input <" + key.toString() + "," + value.toString() + ">");
			context.write(value, NullWritable.get());
			System.out.println("Reduce output <" + value.toString() + ">");
		}
	}

}

Driver class: Website.java

package com.WebsiteCount;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class Website {

	static final String INPUT_PATH = "hdfs://192.xxx.xx.xxx:9000/webcount/in";
	static final String OUT_PATH = "hdfs://192.xxx.xx.xxx:9000/webcount/out";

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		// Delete the output directory if it already exists, so the job can be rerun.
		final FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), conf);
		final Path outPath = new Path(OUT_PATH);
		if (fileSystem.exists(outPath)) {
			fileSystem.delete(outPath, true);
		}

		final Job job = new Job(conf, Website.class.getSimpleName());
//		job.setJarByClass(Website.class);
		// 1.1 Specify where the input files are read from.
		FileInputFormat.setInputPaths(job, INPUT_PATH);
		// Specify how the input is parsed: each line becomes one key-value pair.
		job.setInputFormatClass(TextInputFormat.class);

		// 1.2 Specify the custom Mapper class.
		job.setMapperClass(MyMapper.class);
		// The <k,v> types emitted by the map phase.
		job.setMapOutputKeyClass(LongWritable.class);
		job.setMapOutputValueClass(Text.class);

		// 2.2 Specify the custom Reducer class and its output <k,v> types.
		job.setReducerClass(MyReduce.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);
//		LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
		// 2.3 Specify where the output is written.
		FileOutputFormat.setOutputPath(job, outPath);

		// Submit the job to the cluster and wait for it to finish.
		job.waitForCompletion(true);
	}
}

Next, go to the /usr/local directory on Linux.

Create the input directory in HDFS:

hdfs dfs -mkdir -p /webcount/in

Upload the original flow.dat log file to the /webcount/in directory in HDFS:
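For example, assuming flow.dat is sitting in /usr/local on the Linux machine (the local path is a placeholder; adjust it to wherever the file actually lives):

hdfs dfs -put /usr/local/flow.dat /webcount/in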

Run the program above in Eclipse. Part of the console output looks like this:

Check on HDFS whether the output was written successfully:

hdfs dfs -ls  /webcount/out

View the output on Linux to confirm it succeeded:

hdfs dfs -cat  /webcount/out/part-r-00000

Part of the result looks like this:

Part 2: Use Hive UDFs to count the number of visits to each website in the log file

Start Hive and create a table:

create table webcount(web string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

Load the data into the newly created table:

load data inpath '/webcount/out/part-r-00000' into table webcount;

View the table:
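Note that LOAD DATA INPATH moves (rather than copies) the file from /webcount/out into the table's warehouse directory. As a quick sanity check (the actual rows depend on your flow.dat):

select * from webcount limit 10;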

Writing the UDF functions:

Approach:

First, define a custom UDAF. It is an aggregate that takes many input rows and produces one output row, so the result is concatenated into a single output string: public class Top4GroupBy extends UDAF

It starts by defining an object used to store the data: public static class State

Note that when accumulating, you must check whether the string already exists as a key in the map: if it does, add to its count; if not, put it into the map (this is the key point).

You also need a custom UDTF, which supports one input and multiple outputs. It splits the string on the delimiters and turns it into a list of output rows: public class ExplodeMap extends GenericUDTF

Finally:

These two functions are registered in the Hive function library under the names top_group and explode_map. An example of their use from the web is shown below (get the top 100 URLs for each of the top 100 landingrefer values):

hive -e "select t.landingrefer, mytable.col1, mytable.col2,mytable.col3 from (select landingrefer, top_group(url,100) pro, count(sid) s from pvlog  where dt=20120719 and depth=1 group by landingrefer order by s desc limit 100) t lateral view explode_map(t.pro) mytable as col1, col2, col3;"> test

Writing the UDAF: GroupBy.java

package com.hive;

import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.hive.ql.exec.UDAF;
import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

public class GroupBy extends UDAF {

    // An object used to hold the intermediate aggregation state.
    public static class State {
        private Map<Text, IntWritable> counts;
    }

    /**
     * Accumulate a value: if the string already exists as a key in the map,
     * add to its count; otherwise put it into the map with the given count.
     */
    private static void increment(State s, Text o, int i) {
        if (s.counts == null) {
            s.counts = new HashMap<Text, IntWritable>();
        }
        IntWritable count = s.counts.get(o);
        if (count == null) {
            Text key = new Text();
            key.set(o);
            s.counts.put(key, new IntWritable(i));
        } else {
            count.set(count.get() + i);
        }
    }

    public static class GroupByEvaluator implements UDAFEvaluator {
        private final State state;

        public GroupByEvaluator() {
            state = new State();
        }

        public void init() {
            if (state.counts != null) {
                state.counts.clear();
            }
        }

        public boolean iterate(Text value) {
            if (value == null) {
                return false;
            }
            increment(state, value, 1);
            return true;
        }

        public State terminatePartial() {
            return state;
        }

        public boolean merge(State other) {
            if (state == null || other == null) {
                return false;
            }
            for (Map.Entry<Text, IntWritable> e : other.counts.entrySet()) {
                increment(state, e.getKey(), e.getValue().get());
            }
            return true;
        }

        // Produce the final result: websites sorted by count in descending order,
        // concatenated as "site$@count$*site$@count$*...".
        public Text terminate() {
            if (state == null || state.counts == null || state.counts.size() == 0) {
                return null;
            }
            Map<Text, IntWritable> it = sortByValue(state.counts, true);
            StringBuffer str = new StringBuffer();
            for (Map.Entry<Text, IntWritable> e : it.entrySet()) {
                str.append(e.getKey().toString()).append("$@").append(e.getValue().get()).append("$*");
            }
            return new Text(str.toString());
        }

        /*
         * Sort a map by its values.
         */
        @SuppressWarnings("unchecked")
        public static Map sortByValue(Map map, final boolean reverse) {
            List list = new LinkedList(map.entrySet());
            Collections.sort(list, new Comparator() {
                public int compare(Object o1, Object o2) {
                    if (reverse) {
                        return -((Comparable) ((Map.Entry) o1).getValue()).compareTo(((Map.Entry) o2).getValue());
                    }
                    return ((Comparable) ((Map.Entry) o1).getValue()).compareTo(((Map.Entry) o2).getValue());
                }
            });

            Map result = new LinkedHashMap();
            for (Iterator it = list.iterator(); it.hasNext();) {
                Map.Entry entry = (Map.Entry) it.next();
                result.put(entry.getKey(), entry.getValue());
            }
            return result;
        }
    }
}
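Once registered (the create function statements appear later in this section), the UDAF collapses the whole web column into one delimited string. As a rough sketch of the output format only (the actual sites and counts depend on your data):

select group_by(web) from webcount;

would return a single string such as www.site-a.com$@12$*www.site-b.com$@7$*, where "$@" joins a site to its count and "$*" separates entries. These are exactly the delimiters that the UDTF below splits on.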

Writing the UDTF: SplitResult.java

package com.hive;

import java.util.ArrayList;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class SplitResult extends GenericUDTF {

    @Override
    public void close() throws HiveException {
    }

    // Declares the input and output of the UDTF: validates the input ObjectInspector
    // and returns a struct describing the output rows (number and types of columns).
    @Override
    public StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException {
        if (args.length != 1) {
            throw new UDFArgumentLengthException("SplitResult takes only one argument");
        }
        // getCategory() returns the category of the ObjectInspector (PRIMITIVE, LIST, MAP, STRUCT, ...).
        if (args[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
            throw new UDFArgumentException("SplitResult takes a string as its parameter");
        }
        // The UDTF outputs two string columns per row: col1 (website) and col2 (count).
        ArrayList<String> fieldNames = new ArrayList<String>();
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        fieldNames.add("col1");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("col2");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);

        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    // Processes one input record: splits the aggregated string on "$*" into entries,
    // splits each entry on "$@" into (website, count), and emits each pair via forward().
    @Override
    public void process(Object[] args) throws HiveException {
        String input = args[0].toString();
        String[] test = input.split("\\$\\*");
        for (int i = 0; i < test.length; i++) {
            try {
                String[] result = new String[2];
                String[] sp = test[i].split("\\$\\@");
                result[0] = sp[0];
                result[1] = sp[1];
                // forward() (inherited from GenericUDTF) emits one output row.
                forward(result);
            } catch (Exception e) {
                // Skip malformed entries.
                continue;
            }
        }
    }
}
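The UDTF can also be tried on its own before wiring everything together. A minimal sketch, assuming the functions are registered as group_by and splitrs as shown further below (when a UDTF is used directly in the select list, it must be the only expression there):

select splitrs(pro) as (web, times) from (select group_by(web) pro from webcount) t;

This produces one row per website with two string columns: the site and its count. The lateral view form used at the end of this article is the more general way to combine the UDTF output with other columns.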

Adding the UDFs to Hive:

Package the UDF classes into a jar.

Right-click the project name and choose Export.

Set the output path and the file name of the jar.

The finished jar looks like this:

Upload the jar to the Linux machine.

Use the WinSCP tool to upload the jar to the /usr/local directory on Linux (the screenshot below is from WinSCP):

Add the jar to Hive:

Add the following to the hive-site.xml file:
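The exact snippet is not reproduced here. A minimal sketch using Hive's hive.aux.jars.path property, assuming the jar was exported as udf.jar into /usr/local (both the name and the path are placeholders; adjust them to your jar):

<property>
    <name>hive.aux.jars.path</name>
    <value>file:///usr/local/udf.jar</value>
</property>

Alternatively, the jar can be added only for the current session from the Hive CLI with add jar /usr/local/udf.jar;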

Register the functions in Hive:

create function group_by as 'com.hive.GroupBy';

create function splitrs as 'com.hive.SplitResult';

Query using the registered functions:

select webtimes.web, webtimes.times from (select group_by(web) pro from webcount) t lateral view splitrs(t.pro) webtimes as web, times;

Part of the result looks like this:

End of experiment.