package MRDemo;

import java.io.IOException;
import java.util.HashSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class ProductKind {

    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: ProductKind <input path> <output path>");
            System.exit(1);
        }
        Job job = Job.getInstance(new Configuration(), "ProductKind");
        job.setJarByClass(ProductKind.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapperClass(ProductCountMap.class);
        // No combiner: ProductCountReduce changes the value type (Text -> IntWritable),
        // so it cannot be reused as a combiner here.
        job.setReducerClass(ProductCountReduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Exit with a non-zero status if the job fails.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
    // Mapper: for every valid record, emit (province, product kind).
    public static class ProductCountMap extends Mapper<LongWritable, Text, Text, Text> {

        // Reusable Writable instances, so no new objects are allocated per record.
        private final Text province = new Text();
        private final Text kind = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");
            if (fields.length == 6) {            // skip malformed records
                kind.set(fields[0].trim());      // field 0: product kind
                province.set(fields[4].trim());  // field 4: province
                context.write(province, kind);
            }
        }
    }
    // Reducer: collect all kinds reported for one province into a HashSet,
    // so duplicates collapse, then emit the distinct count.
    public static class ProductCountReduce extends Reducer<Text, Text, Text, IntWritable> {

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            HashSet<String> kinds = new HashSet<String>();
            for (Text value : values) {
                kinds.add(value.toString());
            }
            if (!kinds.isEmpty()) {
                context.write(key, new IntWritable(kinds.size()));
            }
        }
    }
}
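
To see the data flow end to end, here is a small hand-worked example. The column meanings (product kind in field 0, province in field 4 of a six-field, tab-separated record) are inferred from the indices used in map(); the real data set's layout may differ, and the values below are made up purely for illustration.

Input records (tab-separated, placeholder values in the unused fields):

phone	c1	c2	c3	Guangdong	c5
phone	c1	c2	c3	Guangdong	c5
laptop	c1	c2	c3	Guangdong	c5
phone	c1	c2	c3	Beijing	c5

Map output, as (province, kind) pairs: (Guangdong, phone), (Guangdong, phone), (Guangdong, laptop), (Beijing, phone).

Reduce output, as (province, distinct kind count):

Beijing	1
Guangdong	2

Packaged into a jar (the jar name below is only a placeholder), the job can be submitted with the standard hadoop jar command:

hadoop jar productkind.jar MRDemo.ProductKind /path/to/input /path/to/output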
MapReduce Algorithm 2: Data Deduplication (HashSet)
This article presents a product-kind counting program built on the Hadoop MapReduce framework. The Mapper reads each record and emits the product kind and province as a key/value pair; the Reducer then tallies the number of distinct product kinds for each province.
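
The HashSet-based deduplication can also be illustrated outside Hadoop. The following is a minimal, self-contained sketch of the same idea applied to a few in-memory (province, kind) pairs; the class name DedupSketch and the sample values are invented for illustration and are not part of the original program.

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;

// Stand-alone sketch of the reducer's HashSet deduplication, run on
// a few in-memory (province, kind) pairs instead of a real MapReduce job.
public class DedupSketch {
    public static void main(String[] args) {
        List<String[]> mapOutput = Arrays.asList(
                new String[] {"Guangdong", "phone"},
                new String[] {"Guangdong", "phone"},
                new String[] {"Guangdong", "laptop"},
                new String[] {"Beijing", "phone"});

        // Group kinds by province; the HashSet drops duplicate kinds automatically.
        Map<String, HashSet<String>> kindsByProvince = new HashMap<String, HashSet<String>>();
        for (String[] pair : mapOutput) {
            kindsByProvince.computeIfAbsent(pair[0], p -> new HashSet<String>()).add(pair[1]);
        }

        // Print (province, distinct kind count), mirroring the reducer's output.
        for (Map.Entry<String, HashSet<String>> entry : kindsByProvince.entrySet()) {
            System.out.println(entry.getKey() + "\t" + entry.getValue().size());
        }
    }
}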