Part 1: Writing a MapReduce program to clean the data
The end goal is to count how many times each website appears in the log file, so for simplicity this job keeps only the website column. Extracting that column is done in the map phase; the reduce phase simply passes the data through without further processing.
Source code:
The MyMapper class
package com.WebsiteCount;

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MyMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Each log line is tab-separated; the field at index 12 is the website column.
        String data = value.toString();
        String[] splitedData = data.split("\t");
        Text outValue = new Text(splitedData[12]);
        context.write(key, outValue);
        System.out.println("Mapper output <" + key.toString() + "," + outValue.toString() + ">");
    }
}
The MyReduce class
package com.WebsiteCount;

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class MyReduce extends Reducer<LongWritable, Text, Text, NullWritable> {
    @Override
    protected void reduce(LongWritable key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // No aggregation is needed: write every website value straight through.
        for (Text value : values) {
            System.out.println("reduce input <" + key.toString() + "," + value.toString() + ">");
            context.write(value, NullWritable.get());
            System.out.println("reduce output <" + value.toString() + ">");
        }
    }
}
The driver class: Website.java
package com.WebsiteCount;

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Website {
    static final String INPUT_PATH = "hdfs://192.xxx.xx.xxx:9000/webcount/in";
    static final String OUT_PATH = "hdfs://192.xxx.xx.xxx:9000/webcount/out";

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Delete the output directory if it already exists.
        final FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), conf);
        final Path outPath = new Path(OUT_PATH);
        if (fileSystem.exists(outPath)) {
            fileSystem.delete(outPath, true);
        }
        final Job job = Job.getInstance(conf, Website.class.getSimpleName());
        // job.setJarByClass(Website.class);
        // 1.1 Specify where the input files are located.
        FileInputFormat.setInputPaths(job, INPUT_PATH);
        // Specify how the input is parsed: each line becomes an <offset, line> pair.
        job.setInputFormatClass(TextInputFormat.class);
        // 1.2 Specify the custom Mapper class.
        job.setMapperClass(MyMapper.class);
        // The <k, v> types emitted by the map phase.
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        // 2.2 Specify the custom Reducer class and its output types.
        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // 2.3 Specify where the output is written.
        FileOutputFormat.setOutputPath(job, outPath);
        // Submit the job and wait for it to finish.
        job.waitForCompletion(true);
    }
}
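If you would rather submit the job to the cluster than run it from Eclipse, a rough sketch of the invocation (assuming the project is exported as webcount.jar, a name chosen here only for illustration, and that job.setJarByClass(Website.class) is uncommented) would be:
hadoop jar webcount.jar com.WebsiteCount.Website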
Next, switch to the /usr/local directory on Linux.
Create the input directory in HDFS:
hdfs dfs -mkdir -p /webcount/in
Upload the original log file flow.dat to the /webcount/in directory in HDFS:
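For example, if flow.dat sits in the current directory:
hdfs dfs -put flow.dat /webcount/in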
Run the program above in Eclipse; part of the console output is shown below:
Check on HDFS whether the output was written successfully:
hdfs dfs -ls /webcount/out
View the result file on Linux to confirm it succeeded:
hdfs dfs -cat /webcount/out/part-r-00000
Part of the result is shown below:
Part 2: Using Hive UDFs to count how many times each website appears in the log file
Start Hive and create the table:
create table webcount(web string)
> ROW FORMAT DELIMITED
> FIELDS TERMINATED BY ','
> STORED AS TEXTFILE;
Load the data into the newly created table:
load data inpath '/webcount/out/part-r-00000' into table webcount;
Inspect the table:
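For example, to spot-check the loaded data:
select * from webcount limit 10;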
Writing the UDF functions:
Approach:
First, define a UDAF. A UDAF aggregates many input rows into a single output, so the result here is concatenated into one string: public class GroupBy extends UDAF (the web example this is adapted from names the class Top4GroupBy).
Start by defining an object to hold the intermediate data: public static class State.
Note that when accumulating, you must check whether the string already exists as a key in the map: if it does, add to its count; otherwise put it into the map (this is the key point).
A custom UDTF is also needed, supporting one input and multiple outputs. It splits the string on the delimiters and turns it into multiple output rows: public class SplitResult extends GenericUDTF (ExplodeMap in the web example).
Finally:
In the web example these two functions are registered in the Hive function library as top_group and explode_map; the example usage from the web (fetching the top 100 URLs for the top 100 landingrefer values) is:
hive -e "select t.landingrefer, mytable.col1, mytable.col2,mytable.col3 from (select landingrefer, top_group(url,100) pro, count(sid) s from pvlog where dt=20120719 and depth=1 group by landingrefer order by s desc limit 100) t lateral view explode_map(t.pro) mytable as col1, col2, col3;"> test
Writing the UDAF: GroupBy.java
package com.hive;

import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.hive.ql.exec.UDAF;
import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

public class GroupBy extends UDAF {
    // The object used to hold the intermediate counts.
    public static class State {
        private Map<Text, IntWritable> counts;
    }

    /**
     * Accumulate a value: if the string already exists as a key in the map,
     * add to its count; otherwise put it into the map.
     */
    private static void increment(State s, Text o, int i) {
        if (s.counts == null) {
            s.counts = new HashMap<Text, IntWritable>();
        }
        IntWritable count = s.counts.get(o);
        if (count == null) {
            Text key = new Text();
            key.set(o);
            s.counts.put(key, new IntWritable(i));
        } else {
            count.set(count.get() + i);
        }
    }

    public static class GroupByEvaluator implements UDAFEvaluator {
        private final State state;

        public GroupByEvaluator() {
            state = new State();
        }

        public void init() {
            if (state.counts != null) {
                state.counts.clear();
            }
        }

        public boolean iterate(Text value) {
            if (value == null) {
                return false;
            }
            increment(state, value, 1);
            return true;
        }

        public State terminatePartial() {
            return state;
        }

        public boolean merge(State other) {
            if (state == null || other == null || other.counts == null) {
                return false;
            }
            for (Map.Entry<Text, IntWritable> e : other.counts.entrySet()) {
                increment(state, e.getKey(), e.getValue().get());
            }
            return true;
        }

        public Text terminate() {
            if (state == null || state.counts == null || state.counts.size() == 0) {
                return null;
            }
            // Sort by count (descending) and concatenate as "site$@count$*site$@count$*...".
            Map<Text, IntWritable> it = sortByValue(state.counts, true);
            StringBuffer str = new StringBuffer();
            for (Map.Entry<Text, IntWritable> e : it.entrySet()) {
                str.append(e.getKey().toString()).append("$@").append(e.getValue().get()).append("$*");
            }
            return new Text(str.toString());
        }

        /*
         * Sort a map by its values.
         */
        @SuppressWarnings("unchecked")
        public static Map sortByValue(Map map, final boolean reverse) {
            List list = new LinkedList(map.entrySet());
            Collections.sort(list, new Comparator() {
                public int compare(Object o1, Object o2) {
                    if (reverse) {
                        return -((Comparable) ((Map.Entry) o1).getValue()).compareTo(((Map.Entry) o2).getValue());
                    }
                    return ((Comparable) ((Map.Entry) o1).getValue()).compareTo(((Map.Entry) o2).getValue());
                }
            });
            Map result = new LinkedHashMap();
            for (Iterator it = list.iterator(); it.hasNext();) {
                Map.Entry entry = (Map.Entry) it.next();
                result.put(entry.getKey(), entry.getValue());
            }
            return result;
        }
    }
}
Writing the UDTF: SplitResult.java
package com.hive;

import java.util.ArrayList;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class SplitResult extends GenericUDTF {
    @Override
    public void close() throws HiveException {
    }

    // This method validates the input ObjectInspectors and declares the output struct,
    // i.e. how many columns each returned row has and their types.
    @Override
    public StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException {
        if (args.length != 1) {
            throw new UDFArgumentLengthException("SplitResult takes only one argument");
        }
        // getCategory() tells us what kind of ObjectInspector this is;
        // only a primitive (string) argument is accepted.
        if (args[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
            throw new UDFArgumentException("SplitResult takes a string as its parameter");
        }
        ArrayList<String> fieldNames = new ArrayList<String>();
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        fieldNames.add("col1");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("col2");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    // This method processes each input record and emits output rows through forward().
    @Override
    public void process(Object[] args) throws HiveException {
        String input = args[0].toString();
        // The UDAF output looks like "site$@count$*site$@count$*...":
        // split on "$*" to get the entries, then on "$@" to get site and count.
        String[] test = input.split("\\$\\*");
        for (int i = 0; i < test.length; i++) {
            try {
                String[] result = new String[2];
                String[] sp = test[i].split("\\$\\@");
                result[0] = sp[0];
                result[1] = sp[1];
                // forward() (from the parent class) writes one output row.
                forward(result);
            } catch (Exception e) {
                continue;
            }
        }
    }
}
Adding the UDFs to Hive:
Package the UDF classes into a jar.
Right-click the project name and choose Export.
Set the jar output path and the jar file name.
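As an alternative to the Eclipse Export wizard, a roughly equivalent command line (assuming the compiled classes sit under bin/ and using the illustrative jar name webcount-udf.jar) would be:
jar -cvf webcount-udf.jar -C bin .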
The packaged jar looks like this:
Upload the jar to Linux
Upload the packaged jar to the /usr/local directory on Linux with the WinSCP tool (the screenshot below shows WinSCP):
Add the packaged jar to Hive:
Add the following to hive-site.xml:
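A typical entry, assuming the jar was uploaded as /usr/local/webcount-udf.jar (the jar name here is illustrative), registers it as an auxiliary jar:
<property>
    <name>hive.aux.jars.path</name>
    <value>file:///usr/local/webcount-udf.jar</value>
</property>
Restart the Hive CLI afterwards so the new jar is picked up.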
Register the functions in Hive (the class names must match the package declared in the source listings, com.hive here):
create function group_by as 'com.hive.GroupBy';
create function splitrs as 'com.hive.SplitResult';
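As a quick sanity check that the registration worked, you can list and describe the functions:
show functions;
describe function group_by;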
Query with the registered functions. group_by(web) collapses all rows into a single string of the form site$@count$*site$@count$*..., and splitrs then explodes that string back into one (web, times) row per website:
select webtimes.web, webtimes.times from (select group_by(web) pro from webcount) t lateral view splitrs(t.pro) webtimes as web, times;
Part of the result is shown below:
This concludes the experiment.