Given a movie ratings dataset, find the top twenty highest-rated records for each movie.
1. Custom Movie bean class
To use a custom type as a MapReduce input/output type, the class must implement the Writable interface.
To have the custom type sorted, it must additionally implement WritableComparable and define the sort rule in compareTo.
package nue.edu.ls;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

/**
 * Writable is Hadoop's serialization interface;
 * WritableComparable additionally makes the type sortable.
 * @author root
 */
public class MovieBean implements WritableComparable<MovieBean> {
    // Input format: {"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}
    private String movie;
    private int rate;
    private String timeStamp;
    private String uid;

    @Override
    public int compareTo(MovieBean o) {
        // Keep records of the same movie adjacent, then sort by rate
        // in descending order so the highest-rated records come first.
        if (o.getMovie().compareTo(this.getMovie()) == 0) {
            return o.getRate() - this.getRate();
        } else {
            return o.getMovie().compareTo(this.getMovie());
        }
    }

    public void set(String movie, int rate, String timeStamp, String uid) {
        this.movie = movie;
        this.rate = rate;
        this.timeStamp = timeStamp;
        this.uid = uid;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(movie);
        out.writeInt(rate);
        out.writeUTF(timeStamp);
        out.writeUTF(uid);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Fields must be read in exactly the order they were written.
        movie = in.readUTF();
        rate = in.readInt();
        timeStamp = in.readUTF();
        uid = in.readUTF();
    }

    public String getMovie() {
        return movie;
    }
    public void setMovie(String movie) {
        this.movie = movie;
    }
    public int getRate() {
        return rate;
    }
    public void setRate(int rate) {
        this.rate = rate;
    }
    public String getTimeStamp() {
        return timeStamp;
    }
    public void setTimeStamp(String timeStamp) {
        this.timeStamp = timeStamp;
    }
    public String getUid() {
        return uid;
    }
    public void setUid(String uid) {
        this.uid = uid;
    }

    @Override
    public String toString() {
        return "MovieBean [movie=" + movie + ", rate=" + rate + ", timeStamp=" + timeStamp + ", uid=" + uid + "]";
    }

    // Copy another bean's fields; useful because Hadoop reuses key objects.
    public void set(MovieBean movieBean) {
        this.movie = movieBean.getMovie();
        this.rate = movieBean.getRate();
        this.timeStamp = movieBean.getTimeStamp();
        this.uid = movieBean.getUid();
    }
}
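To sanity-check the comparator, here is a minimal hypothetical sketch (the class name is made up and it is not part of the job) that sorts two beans in memory and shows the descending-by-rate order within one movie:
package nue.edu.ls;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

// Hypothetical local check of MovieBean.compareTo, outside MapReduce.
public class MovieBeanSortCheck {
    public static void main(String[] args) {
        List<MovieBean> beans = new ArrayList<>();
        MovieBean a = new MovieBean();
        a.set("1193", 3, "978300760", "1");
        MovieBean b = new MovieBean();
        b.set("1193", 5, "978300761", "2");
        beans.add(a);
        beans.add(b);
        Collections.sort(beans);
        // Within movie "1193", the rate-5 record sorts before the rate-3 record.
        beans.forEach(System.out::println);
    }
}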
2. Partitioner class
The partitioner decides which reduce task each map output record is sent to.
This job asks for the top twenty records per movie. Without a custom partitioner, records of the same movie could be sent to two different reducers, producing up to 2×20 records for that movie and violating the requirement.
We therefore define a class extending Partitioner. The value returned by getPartition (0 or 1 here, since there are two reducers) decides which reducer receives a record, avoiding the split described above.
(key.getMovie().hashCode() & Integer.MAX_VALUE) % numPartitions takes the movie id's hashCode modulo the number of reducers, so every record of a given movie lands on the same reducer; the & Integer.MAX_VALUE mask clears the sign bit and guarantees the hashCode is non-negative. A worked example follows the class below.
When using a custom partitioner, the driver must also call job.setPartitionerClass(MyPartition.class).
package nue.edu.ls;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Partitioner: route records that belong together to the same reducer.
 * @author root
 */
public class MyPartition extends Partitioner<MovieBean, NullWritable> {
    /**
     * key           the key output by the map side
     * value         the value output by the map side
     * numPartitions the number of reduce tasks
     */
    @Override
    public int getPartition(MovieBean key, NullWritable value, int numPartitions) {
        // Mask with Integer.MAX_VALUE so a negative hashCode cannot
        // yield a negative partition number.
        return (key.getMovie().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}
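A minimal sketch of the masking arithmetic (hypothetical class, assuming two reducers as in this job):
package nue.edu.ls;

// Hypothetical demonstration of the partition arithmetic with 2 reducers.
public class PartitionMathDemo {
    public static void main(String[] args) {
        int numPartitions = 2;
        String movie = "1193";
        int raw = movie.hashCode();           // may be negative for some strings
        int masked = raw & Integer.MAX_VALUE; // sign bit cleared, always >= 0
        System.out.println(raw + " -> " + masked % numPartitions); // always 0 or 1
        // Without the mask, a negative hashCode % 2 could be -1,
        // which is not a valid partition number.
    }
}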
3. Grouping comparator
The grouping comparator is set so that all records of one movie fall into a single reduce group.
With a simple key such as Text, two inputs (a,1) and (a,1) have equal keys, so the reducer sees them as one group.
With a composite key such as MovieBean, however, the default grouping falls back to the key's sort comparator (compareTo), which compares both movie and rate: two beans of the same movie with different rates compare as unequal and land in separate groups. (Grouping is decided by comparators on the keys, not by Java object identity or equals.)
In this job, omitting the custom grouping comparator would apply the top-twenty cutoff per (movie, rate) group rather than per movie, so nearly all MovieBean records would be emitted instead of the top twenty.
The custom grouping class extends WritableComparator, compares only the movie id, and must invoke the superclass constructor from its no-arg constructor. A small comparison sketch follows the class below.
package nue.edu.ls;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Grouping comparator: records with the same movie id go into one reduce group.
 */
public class MyGroup extends WritableComparator {
    public MyGroup() {
        // true tells the parent to create MovieBean instances for comparison.
        super(MovieBean.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        MovieBean bean1 = (MovieBean) a;
        MovieBean bean2 = (MovieBean) b;
        // Compare only the movie id; rate is deliberately ignored here.
        return bean1.getMovie().compareTo(bean2.getMovie());
    }
}
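To make the difference concrete, a hypothetical sketch (not part of the job) comparing the two comparisons on beans that share a movie id but differ in rate:
package nue.edu.ls;

// Hypothetical check: same movie, different rate.
public class GroupVsSortDemo {
    public static void main(String[] args) {
        MovieBean x = new MovieBean();
        x.set("1193", 5, "978300760", "1");
        MovieBean y = new MovieBean();
        y.set("1193", 3, "978300761", "2");
        // Sort comparator: non-zero, because the rates differ.
        System.out.println("compareTo: " + x.compareTo(y));               // 3 - 5 = -2
        // Grouping comparator: zero, so both land in one reduce group.
        System.out.println("MyGroup:   " + new MyGroup().compare(x, y));  // 0
    }
}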
4. Driver class
package nue.edu.ls;

import java.io.File;
import java.io.IOException;

import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.codehaus.jackson.map.ObjectMapper;

public class TopN3 {
    public static class MapTask extends Mapper<LongWritable, Text, MovieBean, NullWritable> {
        // Reuse one ObjectMapper instead of creating a new one per record.
        private final ObjectMapper mapper = new ObjectMapper();

        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, MovieBean, NullWritable>.Context context)
                throws IOException, InterruptedException {
            try {
                // Parse one JSON line into a MovieBean and emit it as the key.
                MovieBean bean = mapper.readValue(value.toString(), MovieBean.class);
                context.write(bean, NullWritable.get());
            } catch (Exception e) {
                // Skip malformed lines.
            }
        }
    }

    public static class ReduceTask extends Reducer<MovieBean, NullWritable, MovieBean, NullWritable> {
        @Override
        protected void reduce(MovieBean key, Iterable<NullWritable> values,
                Reducer<MovieBean, NullWritable, MovieBean, NullWritable>.Context context)
                throws IOException, InterruptedException {
            int num = 0;
            // The values are empty placeholders, but each iteration step also
            // advances the (reused) key object to the next record in the group,
            // so writing key inside the loop emits distinct records.
            for (NullWritable nullWritable : values) {
                if (num >= 20) {
                    break;
                }
                num++;
                context.write(key, NullWritable.get());
            }
        }
    }

    public static void main(String[] args) throws Exception {
        System.setProperty("HADOOP_USER_NAME", "root");
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "topn3");
        // Set the mapper, reducer, and the jar to submit.
        job.setMapperClass(MapTask.class);
        job.setReducerClass(ReduceTask.class);
        job.setJarByClass(TopN3.class);
        job.setNumReduceTasks(2);
        job.setPartitionerClass(MyPartition.class);
        job.setGroupingComparatorClass(MyGroup.class);
        // Set the map and job output key/value types.
        job.setMapOutputKeyClass(MovieBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(MovieBean.class);
        job.setOutputValueClass(NullWritable.class);
        // Input and output paths.
        FileInputFormat.addInputPath(job, new Path("d:/rating.json"));
        FileOutputFormat.setOutputPath(job, new Path("d:\\data\\out\\topN3"));
        // Delete the output directory if it already exists,
        // otherwise the job fails at submission.
        File file = new File("d:\\data\\out\\topN3");
        if (file.exists()) {
            FileUtils.deleteDirectory(file);
        }
        // Submit the job and wait for completion.
        boolean completion = job.waitForCompletion(true);
        System.out.println(completion ? "Success!" : "Go fix that bug!");
    }
}
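One pitfall worth noting: Hadoop reuses the key object across the values iteration, so buffering references to key would collect twenty pointers to the same object. If the records need to be held in memory (for example, to post-process the top twenty before writing), copy each one first; this is what MovieBean's set(MovieBean) method is for. A hypothetical sketch of such a buffering variant (not the reducer used above):
package nue.edu.ls;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

// Hypothetical variant of ReduceTask that buffers copies of the top records.
public class BufferingReduceTask extends Reducer<MovieBean, NullWritable, MovieBean, NullWritable> {
    @Override
    protected void reduce(MovieBean key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        List<MovieBean> top = new ArrayList<>();
        for (NullWritable n : values) {
            if (top.size() >= 20) {
                break;
            }
            MovieBean copy = new MovieBean();
            copy.set(key); // copy the reused key object before storing it
            top.add(copy);
        }
        // top.add(key) instead would yield 20 references to one object,
        // all showing the last record seen.
        for (MovieBean bean : top) {
            context.write(bean, NullWritable.get());
        }
    }
}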
5. Results