数据
{"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}
{"movie":"1193","rate":"4","timeStamp":"978300760","uid":"1"}
{"movie":"1193","rate":"2","timeStamp":"978300760","uid":"1"}
{"movie":"1193","rate":"1","timeStamp":"978300760","uid":"1"}
{"movie":"661","rate":"3","timeStamp":"978302109","uid":"1"}
{"movie":"3114","rate":"4","timeStamp":"978302174","uid":"1"}
需求
求出每个用户评分最高的前20条数据,将评分按从高到低排序
需求分析
1.将数据中各属性封装到MovieBean中,将各个属性进行序列化和反序列化
2.Map将数据以json格式读出,并将数据处理为(key,value)–> (MovieBean,NullWritable)输出
map阶段输出的key是可排序,自定义的bean想按照自己的规则进行排序,需要实现WritableComparator接口
3.自定义分区: Partitioner.getPartition(k,v,numPartitions)分区 按照一定的规则把‘相同’的数据放到同一台reduce上
4:自定义分组: 继承WritableComparator类 .compare方法比较key 构造方法里面传递你真实的对象类型
5:Reduce遍历输出每个组的前20条数据
代码
MovieBean
// Fields mirror one JSON input record.
// NOTE(review): "movie" is referenced by the accessors, toString and
// (de)serialization below but its declaration was missing from the original
// listing — restored here.
private String movie;
private int rate;
private String timeStamp;
private String uid;
/** @return the movie id */
public String getMovie() {
    return this.movie;
}

/** @param movie the movie id */
public void setMovie(String movie) {
    this.movie = movie;
}

/** @return the rating value */
public int getRate() {
    return this.rate;
}

/** @param rate the rating value */
public void setRate(int rate) {
    this.rate = rate;
}

/** @return the rating timestamp (kept as the raw string from the JSON) */
public String getTimeStamp() {
    return this.timeStamp;
}

/** @param timeStamp the rating timestamp */
public void setTimeStamp(String timeStamp) {
    this.timeStamp = timeStamp;
}

/** @return the user id */
public String getUid() {
    return this.uid;
}

/** @param uid the user id */
public void setUid(String uid) {
    this.uid = uid;
}
/** Renders the bean in the same bracketed form as the original listing. */
@Override
public String toString() {
    StringBuilder sb = new StringBuilder("MovieBean [movie=");
    sb.append(movie).append(", rate=").append(rate);
    sb.append(", timeStamp=").append(timeStamp);
    sb.append(", uid=").append(uid).append("]");
    return sb.toString();
}
// Deserialization: reads the fields back in EXACTLY the order write() emitted
// them (movie, rate, timeStamp, uid) — the two methods must stay in sync.
@Override
public void readFields(DataInput in) throws IOException {
movie = in.readUTF();
rate = in.readInt();
timeStamp = in.readUTF();
uid = in.readUTF();
}
// Serialization: writes the fields in a fixed order (movie, rate, timeStamp,
// uid); readFields() above depends on this exact order.
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(movie);
out.writeInt(rate);
out.writeUTF(timeStamp);
out.writeUTF(uid);
}
// Shuffle sort order: primarily by uid (reverse natural order, matching the
// original), then by rate descending so each user's highest ratings come first.
@Override
public int compareTo(MovieBean o) {
    int byUid = o.getUid().compareTo(this.getUid());
    if (byUid != 0) {
        return byUid;
    }
    // Integer.compare avoids the int-overflow bug of "o.getRate() - this.getRate()"
    // (subtraction comparators break for values far apart).
    return Integer.compare(o.getRate(), this.getRate());
}
自定义分区
public class MyPartition extends Partitioner<MovieBean, NullWritable>{
    /**
     * Routes every record with the same uid to the same reduce task.
     */
    @Override
    public int getPartition(MovieBean key, NullWritable value, int numPartitions) {
        // Mask off the sign bit so the partition index is never negative,
        // even when hashCode() is Integer.MIN_VALUE.
        int nonNegativeHash = key.getUid().hashCode() & Integer.MAX_VALUE;
        return nonNegativeHash % numPartitions;
    }
}
自定义分组
public class MyGroup extends WritableComparator{
    public MyGroup(){
        // Register the concrete key type (true = allow instance creation).
        super(MovieBean.class, true);
    }

    /**
     * Groups records by uid only, so a single reduce() call sees all of one
     * user's ratings regardless of the full key's rate-based ordering.
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        String uidA = ((MovieBean) a).getUid();
        String uidB = ((MovieBean) b).getUid();
        return uidA.compareTo(uidB);
    }
}
MapReduce部分
public class TopN3 {

    /** Parses one JSON input line into a MovieBean and emits it as the map-output key. */
    public static class MapTask extends Mapper<LongWritable, Text, MovieBean, NullWritable>{
        // ObjectMapper is thread-safe after configuration and expensive to build;
        // share one instance instead of allocating a new one per input record.
        private static final ObjectMapper MAPPER = new ObjectMapper();

        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, MovieBean, NullWritable>.Context context)
                throws IOException, InterruptedException {
            MovieBean movieBean = MAPPER.readValue(value.toString(), MovieBean.class);
            context.write(movieBean, NullWritable.get());
        }
    }

    /** Emits at most the first 20 records of each uid group (already sorted rate-descending). */
    public static class ReduceTask extends Reducer<MovieBean, NullWritable, MovieBean, NullWritable>{
        @Override
        protected void reduce(MovieBean key, Iterable<NullWritable> values,
                Reducer<MovieBean, NullWritable, MovieBean, NullWritable>.Context context)
                throws IOException, InterruptedException {
            int emitted = 0;
            // NOTE(review): with a grouping comparator, Hadoop updates "key" in
            // place as the value iterator advances, so each write() below emits
            // the current record of the group, not the first one.
            for (NullWritable ignored : values) {
                if (emitted >= 20) {
                    break;
                }
                context.write(key, NullWritable.get());
                emitted++;
            }
        }
    }

    public static void main(String[] args) throws Exception{
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "topn3");
        // Wire up the mapper, reducer and the jar that contains them.
        job.setMapperClass(MapTask.class);
        job.setReducerClass(ReduceTask.class);
        job.setJarByClass(TopN3.class);
        // Two reducers; MyPartition keeps every uid on a single reducer.
        job.setNumReduceTasks(2);
        job.setPartitionerClass(MyPartition.class);
        job.setGroupingComparatorClass(MyGroup.class);
        // The key carries all record fields; the value is only a placeholder.
        job.setMapOutputKeyClass(MovieBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(MovieBean.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.addInputPath(job, new Path("D:\\a\\movie.txt"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\a\\MovieTopN3out"));
        // Delete a stale output directory before submitting — Hadoop fails the
        // job if the output path already exists. Uses java.io.File, so this
        // assumes local-mode execution — TODO confirm for cluster runs.
        File file = new File("D:\\a\\MovieTopN3out");
        if(file.exists()){
            FileUtils.deleteDirectory(file);
        }
        boolean completion = job.waitForCompletion(true);
        System.out.println(completion?"你很优秀!!!":"滚去调bug!!");
    }
}
*自定义排序的好处是避免了资源和空间的浪费,MapReduce框架自带分区、分组、排序,使用TreeSet排序没有用到MapReduce框架的排序,造成浪费。