ps:
一、字符串比较用compareTo()时:
1,从第一位开始逐字符比较,遇到第一个不同的字符时,立即返回这两个字符的char值(Unicode码)的差值。
2,若逐字符比较到较短的字符串结束仍然相同(即一个字符串是另一个的前缀),则返回两个字符串长度的差值;完全相同则返回0。
二、Integer比较用compareTo()时:
1,对比的数字相同时,返回0。
2,对比的数字不同时,小于对方返回-1,大于对方返回1。
实现每个人最喜欢的电影topN,按照uid、分数进行排序
一、 RateBean
implements WritableComparable
重写的compareTo方法,是在shuffle阶段排序的时候用到的。
package topn;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
* @Description
* @Author cqh <caoqingghai@1000phone.com>
* @Version V1.0
* @Since 1.0
* @Date 2019/4/15 09:48
*/
public class RateBean implements WritableComparable<RateBean> {
private String movie;
private String rate;
private String timeStamp;
private String uid;
public RateBean() {
}
public RateBean(String movie) {
this.movie = movie;
}
public String getMovie() {
return movie;
}
public void setMovie(String movie) {
this.movie = movie;
}
public String getRate() {
return rate;
}
public void setRate(String rate) {
this.rate = rate;
}
public String getTimeStamp() {
return timeStamp;
}
public void setTimeStamp(String timeStamp) {
this.timeStamp = timeStamp;
}
public String getUid() {
return uid;
}
public void setUid(String uid) {
this.uid = uid;
}
@Override
public int compareTo(RateBean o) {
if(this.uid.equals(o.uid)){
return -this.rate.compareTo(o.rate);
}else {
return this.uid.compareTo(o.uid);
}
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(movie);
out.writeUTF(rate);
out.writeUTF(timeStamp);
out.writeUTF(uid);
}
@Override
public String toString() {
return movie +"\t" + rate + "\t" + uid;
}
@Override
public void readFields(DataInput in) throws IOException {
this.movie = in.readUTF();
this.rate = in.readUTF();
this.timeStamp = in.readUTF();
this.uid = in.readUTF();
}
}
二、RatePartitioner
extends Partitioner
map端在分区的时候,用uid的hash值对分区数取模,来确定该条记录去往哪个分区
package topn;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;
/**
* @Description
* @Author cqh <caoqingghai@1000phone.com>
* @Version V1.0
* @Since 1.0
* @Date 2019/4/15 10:17
*/
public class RatePartitioner extends Partitioner<RateBean, NullWritable> {
@Override
public int getPartition(RateBean rateBean, NullWritable nullWritable, int numPartitions) {
//rateBean.getUid().hashCode()&Integer.MAX_VALUE防止超出INT范围出现负数
return rateBean.getUid().hashCode()&Integer.MAX_VALUE%numPartitions;
}
}
三、RateGroupingComparable
extends WritableComparator
确保reduce端分组的时候,将uid相同的分到同一个组
没有这个的话,就会按照RateBean自己定义的compareTo规则来进行分组了。
package topn;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
* @Description
* @Author cqh <caoqingghai@1000phone.com>
* @Version V1.0
* @Since 1.0
* @Date 2019/4/15 10:28
*/
public class RateGroupingComparable extends WritableComparator {

    /** Registers RateBean with the parent and enables instance creation. */
    public RateGroupingComparable() {
        super(RateBean.class, true);
    }

    /**
     * Groups reducer input by uid only, so every rating of one user reaches
     * a single reduce() call regardless of its position in the sort order.
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        String leftUid = ((RateBean) a).getUid();
        String rightUid = ((RateBean) b).getUid();
        return leftUid.compareTo(rightUid);
    }
}
四、mapper
package topn;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.codehaus.jackson.map.ObjectMapper;
import java.io.IOException;
/**
* @Description
* @Author cqh <caoqingghai@1000phone.com>
* @Version V1.0
* @Since 1.0
* @Date 2019/4/15 10:05
*/
public class RateMapper extends Mapper<LongWritable, Text,RateBean, NullWritable> {
ObjectMapper objectMapper;
@Override
//setup方法仅调用一次,一般用来给一些变量赋值
protected void setup(Context context) throws IOException, InterruptedException {
objectMapper = new ObjectMapper();
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
RateBean rateBean = objectMapper.readValue(value.toString(), RateBean.class);
context.write(rateBean,NullWritable.get());
}
@Override
//仅调用一次,maptask执行完成后调用一次,用于关闭一些资源
protected void cleanup(Context context) throws IOException, InterruptedException {
super.cleanup(context);
}
}
五、reducer
package topn;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
* @Description
* @Author cqh <caoqingghai@1000phone.com>
* @Version V1.0
* @Since 1.0
* @Date 2019/4/15 10:23
*/
public class RateReducer extends Reducer<RateBean, NullWritable,RateBean, NullWritable> {
@Override
protected void setup(Context context) throws IOException, InterruptedException {
super.setup(context);
}
@Override
protected void reduce(RateBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
int topN = context.getConfiguration().getInt("topN",4);
int count = 0;
for (NullWritable value:values){
context.write(key,NullWritable.get());
count++;
if (count==topN){
return;
}
}
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
super.cleanup(context);
}
}
六、runner
package topn;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Logger;
import wordcount.WordCountMapper;
import wordcount.WordCountReduce;
import java.io.IOException;
/**
* @Description
* @Author cqh <caoqingghai@1000phone.com>
* @Version V1.0
* @Since 1.0
* @Date 2019/4/15 10:53
*/
public class RateRunner {
    private static Logger logger = Logger.getLogger(RateRunner.class);

    /**
     * Job driver: wires mapper, reducer, partitioner and grouping comparator
     * for the per-user top-N movie job.
     *
     * args[0] = input directory; args[1] = output directory, deleted first
     * if it already exists so the job can rerun.
     */
    public static void main(String[] args) {
        try {
            Configuration conf = new Configuration();
            // Uncomment to override the reducer's default top-N of 4.
            //conf.set("topN","5");
            Job job = Job.getInstance(conf,"topN");
            logger.info("一切正常================================");
            // FIX: required on a cluster so Hadoop can locate and ship the
            // jar that contains these job classes.
            job.setJarByClass(RateRunner.class);
            job.setMapperClass(RateMapper.class);
            job.setReducerClass(RateReducer.class);
            // Map output is serialized between map and reduce, so the
            // framework needs the exact intermediate key/value types.
            job.setMapOutputKeyClass(RateBean.class);
            job.setMapOutputValueClass(NullWritable.class);
            // Final (reducer) output types.
            job.setOutputKeyClass(RateBean.class);
            job.setOutputValueClass(NullWritable.class);
            // Plain text input, one JSON record per line.
            job.setInputFormatClass(TextInputFormat.class);
            // Input path is a directory; if it contains sub-directories,
            // recursive reading must be enabled or the job fails.
            FileInputFormat.addInputPath(job,new Path(args[0]));
            job.setPartitionerClass(RatePartitioner.class);
            job.setGroupingComparatorClass(RateGroupingComparable.class);
            FileSystem fs = FileSystem.get(conf);
            Path out = new Path(args[1]);
            if (fs.exists(out)){
                logger.info("目录存在,删除================================");
                fs.delete(out,true);
            }
            FileOutputFormat.setOutputPath(job,out);
            boolean res = job.waitForCompletion(true);
            System.exit(res?0:1);
        } catch (Exception e) {
            logger.error("获取job异常",e);
            e.printStackTrace();
        }
    }
}