Part 1: Writing a MapReduce program to clean the data
The end goal is to count how many times each website appears in the log file, so for simplicity this job keeps only the website column. Extracting that column is done in the map phase; the reduce phase simply passes the data through without further processing.
Source code:
The MyMapper class
package com.WebsiteCount;

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MyMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Each log line is tab-separated; the field at index 12 is the website column.
        String data = value.toString();
        String[] splitedData = data.split("\t");
        Text outValue = new Text(splitedData[12]);
        context.write(key, outValue);
        System.out.println("Mapper output <" + key.toString() + "," + outValue.toString() + ">");
    }
}
The MyReduce class
package com.WebsiteCount;

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class MyReduce extends Reducer<LongWritable, Text, Text, NullWritable> {
    @Override
    protected void reduce(LongWritable key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // No aggregation is needed: write every website value straight through.
        for (Text value : values) {
            System.out.println("reduce input <" + key.toString() + "," + value.toString() + ">");
            context.write(value, NullWritable.get());
            System.out.println("reduce output <" + value.toString() + ">");
        }
    }
}
The driver class: Website.java
package com.WebsiteCount;

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Website {
    static final String INPUT_PATH = "hdfs://192.xxx.xx.xxx:9000/webcount/in";
    static final String OUT_PATH = "hdfs://192.xxx.xx.xxx:9000/webcount/out";

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Delete the output directory if it already exists.
        final FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), conf);
        final Path outPath = new Path(OUT_PATH);
        if (fileSystem.exists(outPath)) {
            fileSystem.delete(outPath, true);
        }
        final Job job = Job.getInstance(conf, Website.class.getSimpleName());
        // job.setJarByClass(Website.class);
        // 1.1 Specify where the input files are located.
        FileInputFormat.setInputPaths(job, INPUT_PATH);
        // Specify how the input is parsed: each line becomes an <offset, line> pair.
        job.setInputFormatClass(TextInputFormat.class);
        // 1.2 Specify the custom Mapper class.
        job.setMapperClass(MyMapper.class);
        // The <k, v> types emitted by the map phase.
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        // 2.2 Specify the custom Reducer class and its output types.
        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // 2.3 Specify where the output is written.
        FileOutputFormat.setOutputPath(job, outPath);
        // Submit the job and wait for it to finish.
        job.waitForCompletion(true);
    }
}
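If you would rather submit the job to the cluster than run it from Eclipse, a rough sketch of the invocation (assuming the project is exported as webcount.jar, a name chosen here only for illustration, and that job.setJarByClass(Website.class) is uncommented) would be:
hadoop jar webcount.jar com.WebsiteCount.Website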
Next, switch to the /usr/local directory on Linux.
Create the input directory in HDFS:
hdfs dfs -mkdir -p /webcount/in
Upload the original log file flow.dat to the /webcount/in directory in HDFS:
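For example, if flow.dat sits in the current directory:
hdfs dfs -put flow.dat /webcount/in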
Run the program above in Eclipse; part of the console output is shown below:
Check on HDFS whether the output was written successfully:
hdfs dfs -ls /webcount/out
View the result file on Linux to confirm it succeeded:
hdfs dfs -cat /webcount/out/part-r-00000
Part of the result is shown below:
Part 2: Using Hive UDFs to count how many times each website appears in the log file
Start Hive and create the table:
create table webcount(web string)
> ROW FORMAT DELIMITED
> FIELDS TERMINATED BY ','
> STORED AS TEXTFILE;
Load the data into the newly created table:
load data inpath '/webcount/out/part-r-00000' into table webcount;
Inspect the table:
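For example, to spot-check the loaded data:
select * from webcount limit 10;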
Writing the UDF functions:
Approach:
First, define a UDAF. A UDAF aggregates many input rows into a single output, so the result here is concatenated into one string: public class GroupBy extends UDAF (the web example this is adapted from names the class Top4GroupBy).
Start by defining an object to hold the intermediate data: public static class State.
Note that when accumulating, you must check whether the string already exists as a key in the map: if it does, add to its count; otherwise put it into the map (this is the key point).
A custom UDTF is also needed, supporting one input and multiple outputs. It splits the string on the delimiters and turns it into multiple output rows: public class SplitResult extends GenericUDTF (ExplodeMap in the web example).
Finally:
In the web example these two functions are registered in the Hive function library as top_group and explode_map; the example usage from the web (fetching the top 100 URLs for the top 100 landingrefer values) is:
hive -e "select t.landingrefer, mytable.col1, mytable.col2,mytable.col3 from (select landingrefer, top_group(url,100) pro, count(sid) s from pvlog where dt=20120719 and depth=1 group by landingrefer order by s desc limit 100) t lateral view explode_map(t.pro) mytable as col1, col2, col3;"> test
Writing the UDAF: GroupBy.java
package com.hive;

import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.hive.ql.exec.UDAF;
import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

public class GroupBy extends UDAF {
    // The object used to hold the intermediate counts.
    public static class State {
        private Map<Text, IntWritable> counts;
    }

    /**
     * Accumulate a value: if the string already exists as a key in the map,
     * add to its count; otherwise put it into the map.
     */
    private static void increment(State s, Text o, int i) {
        if (s.counts == null) {
            s.counts = new HashMap<Text, IntWritable>();
        }
        IntWritable count = s.counts.get(o);
        if (count == null) {
            Text key = new Text();
            key.set(o);
            s.counts.put(key, new IntWritable(i));
        } else {
            count.set(count.get() + i);
        }
    }

    public static class GroupByEvaluator implements UDAFEvaluator {
        private final State state;

        public GroupByEvaluator() {
            state = new State();
        }

        public void init() {
            if (state.counts != null) {
                state.counts.clear();
            }
        }

        public boolean iterate(Text value) {
            if (value == null) {
                return false;
            }
            increment(state, value, 1);
            return true;
        }

        public State terminatePartial() {
            return state;
        }

        public boolean merge(State other) {
            if (state == null || other == null || other.counts == null) {
                return false;
            }
            for (Map.Entry<Text, IntWritable> e : other.counts.entrySet()) {
                increment(state, e.getKey(), e.getValue().get());
            }
            return true;
        }

        public Text terminate() {
            if (state == null || state.counts == null || state.counts.size() == 0) {
                return null;
            }
            // Sort by count (descending) and concatenate as "site$@count$*site$@count$*...".
            Map<Text, IntWritable> it = sortByValue(state.counts, true);
            StringBuffer str = new StringBuffer();
            for (Map.Entry<Text, IntWritable> e : it.entrySet()) {
                str.append(e.getKey().toString()).append("$@").append(e.getValue().get()).append("$*");
            }
            return new Text(str.toString());
        }

        /*
         * Sort a map by its values.
         */
        @SuppressWarnings("unchecked")
        public static Map sortByValue(Map map, final boolean reverse) {
            List list = new LinkedList(map.entrySet());
            Collections.sort(list, new Comparator() {
                public int compare(Object o1, Object o2) {
                    if (reverse) {
                        return -((Comparable) ((Map.Entry) o1).getValue()).compareTo(((Map.Entry) o2).getValue());
                    }
                    return ((Comparable) ((Map.Entry) o1).getValue()).compareTo(((Map.Entry) o2).getValue());
                }
            });
            Map result = new LinkedHashMap();
            for (Iterator it = list.iterator(); it.hasNext();) {
                Map.Entry entry = (Map.Entry) it.next();
                result.put(entry.getKey(), entry.getValue());
            }
            return result;
        }
    }
}
Writing the UDTF: SplitResult.java
package com.hive;

import java.util.ArrayList;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class SplitResult extends GenericUDTF {
    @Override
    public void close() throws HiveException {
    }

    // This method validates the input ObjectInspectors and declares the output struct,
    // i.e. how many columns each returned row has and their types.
    @Override
    public StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException {
        if (args.length != 1) {
            throw new UDFArgumentLengthException("SplitResult takes only one argument");
        }
        // getCategory() tells us what kind of ObjectInspector this is;
        // only a primitive (string) argument is accepted.
        if (args[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
            throw new UDFArgumentException("SplitResult takes a string as its parameter");
        }
        ArrayList<String> fieldNames = new ArrayList<String>();
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        fieldNames.add("col1");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("col2");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    // This method processes each input record and emits output rows through forward().
    @Override
    public void process(Object[] args) throws HiveException {
        String input = args[0].toString();
        // The UDAF output looks like "site$@count$*site$@count$*...":
        // split on "$*" to get the entries, then on "$@" to get site and count.
        String[] test = input.split("\\$\\*");
        for (int i = 0; i < test.length; i++) {
            try {
                String[] result = new String[2];
                String[] sp = test[i].split("\\$\\@");
                result[0] = sp[0];
                result[1] = sp[1];
                // forward() (from the parent class) writes one output row.
                forward(result);
            } catch (Exception e) {
                continue;
            }
        }
    }
}
Adding the UDFs to Hive:
Package the UDF classes into a jar.
Right-click the project name and choose Export.
Set the jar output path and the jar file name.
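As an alternative to the Eclipse Export wizard, a roughly equivalent command line (assuming the compiled classes sit under bin/ and using the illustrative jar name webcount-udf.jar) would be:
jar -cvf webcount-udf.jar -C bin .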
The packaged jar looks like this:
Upload the jar to Linux
Upload the packaged jar to the /usr/local directory on Linux with the WinSCP tool (the screenshot below shows WinSCP):
Add the packaged jar to Hive:
Add the following to hive-site.xml:
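A typical entry, assuming the jar was uploaded as /usr/local/webcount-udf.jar (the jar name here is illustrative), registers it as an auxiliary jar:
<property>
    <name>hive.aux.jars.path</name>
    <value>file:///usr/local/webcount-udf.jar</value>
</property>
Restart the Hive CLI afterwards so the new jar is picked up.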
Register the functions in Hive (the class names must match the package declared in the source listings, com.hive here):
create function group_by as 'com.hive.GroupBy';
create function splitrs as 'com.hive.SplitResult';
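As a quick sanity check that the registration worked, you can list and describe the functions:
show functions;
describe function group_by;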
Query with the registered functions. group_by(web) collapses all rows into a single string of the form site$@count$*site$@count$*..., and splitrs then explodes that string back into one (web, times) row per website:
select webtimes.web, webtimes.times from (select group_by(web) pro from webcount) t lateral view splitrs(t.pro) webtimes as web, times;
Part of the result is shown below:
This concludes the experiment.