Data 1:
huangbo love xuzheng
huangxiaoming love baby huangxiaoming love mimi
liangchaowei love liujialing
Data 2:
hello huangbo
hello xuzheng
hello huangxiaoming
Problem 1: Write a MapReduce program that produces results in the following format: for each keyword, report on which lines of each document it appears and how many times it appears on each of those lines.
For example, the result for the keyword huangxiaoming looks like:
huangxiaoming mapreduce-4-1.txt:2,2;mapreduce-4-1.txt:4,1;mapreduce-4-2.txt:3,1
Approach: the first job splits each file into words and tags every word with its file name and line number, emitting word:fileName:lineNo as the key with a count of 1 as the value, and its reducer sums the counts. A second MapReduce job then regroups those intermediate records by keyword into the required format.
The first MapReduce job:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Question3_1_1 {
    public static class MRMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        Text k = new Text();
        IntWritable v = new IntWritable(1);
        // Line number within the current split; each of these small files is a single
        // split handled by one mapper, so this is the line number within the file
        int num = 0;

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            // Advance the line number
            num++;
            String[] words = line.split(" ");
            // Target format: huangxiaoming mapreduce-4-1.txt:2,2;mapreduce-4-1.txt:4,1;mapreduce-4-2.txt:3,1
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            // Get the file name from the input split
            String fileName = inputSplit.getPath().getName();
            for (String word : words) {
                // Emit word + file name + line number as the key
                k.set(word + ":" + fileName + ":" + num);
                System.out.println(word + "--" + fileName + "--" + num);
                context.write(k, v);
            }
        }
    }
    public static class MRReducer extends Reducer<Text, IntWritable, Text, NullWritable> {

        Text t = new Text();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // The key is word:fileName:lineNo; summing the values counts how many
            // times that word appears on that line of that file
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }
            // Emit "word:fileName:lineNo,count" as a single text record
            t.set(key.toString() + "," + count);
            context.write(t, NullWritable.get());
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Job job = Job.getInstance(conf);
        job.setJarByClass(Question3_1_1.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path("G:/test/q3/input"));
        // Delete the output directory if it already exists
        if (fs.exists(new Path("G:/test/q3/output_3_1"))) {
            fs.delete(new Path("G:/test/q3/output_3_1"), true);
        }
        FileOutputFormat.setOutputPath(job, new Path("G:/test/q3/output_3_1"));
        job.setMapperClass(MRMapper.class);
        job.setReducerClass(MRReducer.class);
        // Exit with 0 on success, 1 on failure
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
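With the sample files above (assuming they are stored as mapreduce-4-1.txt and mapreduce-4-2.txt under G:/test/q3/input), the output of this first job is one record per word/file/line combination in the form word:fileName:lineNo,count, for example:

baby:mapreduce-4-1.txt:2,1
hello:mapreduce-4-2.txt:1,1
huangxiaoming:mapreduce-4-1.txt:2,2

The second job below splits these records on ":" to recover the keyword and regroups them into the required format.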
The second MapReduce job:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Question3_1_2 {
    public static class MRMapper extends Mapper<LongWritable, Text, Text, Text> {

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Input records come from the first job, e.g. baby:mapreduce-4-1.txt:2,1
            String line = value.toString();
            String[] files = line.split(":");
            // Key: the word; value: fileName:lineNo,count
            String str = files[1] + ":" + files[2];
            context.write(new Text(files[0]), new Text(str));
        }
    }
    public static class MRReducer extends Reducer<Text, Text, Text, Text> {

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // Concatenate every fileName:lineNo,count entry for this keyword
            StringBuffer sb = new StringBuffer();
            for (Text text : values) {
                sb.append(text.toString()).append(";");
            }
            context.write(key, new Text(sb.toString()));
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Job job = Job.getInstance(conf);
        job.setJarByClass(Question3_1_2.class);
        job.setMapperClass(MRMapper.class);
        job.setReducerClass(MRReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // The input is the output directory of the first job
        FileInputFormat.setInputPaths(job, new Path("G:/test/q3/output_3_1"));
        // Delete the output directory if it already exists
        if (fs.exists(new Path("G:/test/q3/output_3_2"))) {
            fs.delete(new Path("G:/test/q3/output_3_2"), true);
        }
        FileOutputFormat.setOutputPath(job, new Path("G:/test/q3/output_3_2"));
        // Exit with 0 on success, 1 on failure
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Problem 2: Write a MapReduce program that counts how many times each keyword appears in each document, sorted by occurrence count in descending order.
For example:
huangxiaoming mapreduce-4-1.txt,3;mapreduce-4-2.txt,1
Meaning of the answer above:
The keyword huangxiaoming appears 3 times in the first document mapreduce-4-1.txt and 1 time in the second document mapreduce-4-2.txt.
Approach: first count how many times each keyword appears in each file, then sort the results.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Question3_2_1 {
    public static class MRMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        Text k = new Text();
        IntWritable v = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] words = line.split(" ");
            // Target format: huangxiaoming mapreduce-4-1.txt,3;mapreduce-4-2.txt,1
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            // Get the file name from the input split
            String fileName = inputSplit.getPath().getName();
            for (String word : words) {
                // Emit word + file name as the key
                k.set(word + ":" + fileName);
                context.write(k, v);
            }
        }
    }
    public static class MRReducer extends Reducer<Text, IntWritable, Text, NullWritable> {

        Text t = new Text();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // Sum up the occurrences of this word in this file
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }
            t.set(key.toString() + "," + count);
            context.write(t, NullWritable.get());
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Job job = Job.getInstance(conf);
        job.setJarByClass(Question3_2_1.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path("G:/test/q3/input"));
        // Delete the output directory if it already exists
        if (fs.exists(new Path("G:/test/q3/output_3_3"))) {
            fs.delete(new Path("G:/test/q3/output_3_3"), true);
        }
        FileOutputFormat.setOutputPath(job, new Path("G:/test/q3/output_3_3"));
        job.setMapperClass(MRMapper.class);
        job.setReducerClass(MRReducer.class);
        // Exit with 0 on success, 1 on failure
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
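The output of this job is one record per keyword/file pair in the form word:fileName,count. With the sample data above (again assuming the file names mapreduce-4-1.txt and mapreduce-4-2.txt), it would include, for instance:

hello:mapreduce-4-2.txt,3
huangbo:mapreduce-4-1.txt,1
huangbo:mapreduce-4-2.txt,1

The next job parses these records by splitting on ":" and then on ",".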
The second job wraps each of these records in a custom object (TestBean), groups the records by keyword, and sorts them by occurrence count in descending order:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Question3_2_2 {
    // Target output format: huangxiaoming  mapreduce-4-1.txt,3;mapreduce-4-2.txt,1
    // Input record format:  yangmi:mapreduce-4-1.txt,1
    public static class MRMapper extends Mapper<LongWritable, Text, TestBean, NullWritable> {

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Parse word:fileName,count into a TestBean(word, fileName, count)
            String[] line = value.toString().split(":");
            TestBean tb = new TestBean(line[0], line[1].split(",")[0], Integer.parseInt(line[1].split(",")[1]));
            context.write(tb, NullWritable.get());
        }
    }
    public static class MRReducer extends Reducer<TestBean, NullWritable, Text, Text> {

        Text k = new Text();
        Text v = new Text();

        @Override
        protected void reduce(TestBean key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            // With the grouping comparator, all records for one keyword arrive in a single
            // reduce() call; iterating the values advances the key through those records
            // in sorted order (count descending), so they are appended highest count first
            StringBuffer sb = new StringBuffer();
            for (NullWritable nv : values) {
                sb.append(key.getFileName()).append(",").append(key.getNum()).append(";");
            }
            k.set(key.getName());
            v.set(sb.toString());
            context.write(k, v);
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Job job = Job.getInstance(conf);
        job.setJarByClass(Question3_2_2.class);
        job.setMapperClass(MRMapper.class);
        job.setReducerClass(MRReducer.class);
        job.setMapOutputKeyClass(TestBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Group records that share the same keyword into one reduce() call
        job.setGroupingComparatorClass(UserGC.class);
        // The input is the output directory of the previous job
        FileInputFormat.setInputPaths(job, new Path("G:/test/q3/output_3_3"));
        // Delete the output directory if it already exists
        if (fs.exists(new Path("G:/test/q3/output_3_4"))) {
            fs.delete(new Path("G:/test/q3/output_3_4"), true);
        }
        FileOutputFormat.setOutputPath(job, new Path("G:/test/q3/output_3_4"));
        // Exit with 0 on success, 1 on failure
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Custom data type: TestBean
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
public class TestBean implements WritableComparable<TestBean>{
    private String name;
    private String fileName;
    private int num;

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getFileName() {
        return fileName;
    }

    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    public int getNum() {
        return num;
    }

    public void setNum(int num) {
        this.num = num;
    }

    public TestBean() {
        super();
    }

    public TestBean(String name, String fileName, int num) {
        super();
        this.name = name;
        this.fileName = fileName;
        this.num = num;
    }
    @Override
    public void write(DataOutput out) throws IOException {
        // Serialize the fields in a fixed order...
        out.writeUTF(name);
        out.writeUTF(fileName);
        out.writeInt(num);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // ...and deserialize them in exactly the same order
        name = in.readUTF();
        fileName = in.readUTF();
        num = in.readInt();
    }
    @Override
    public int compareTo(TestBean o) {
        // Keys with the same keyword are ordered by count, descending; keys with
        // different keywords are ordered by keyword, which keeps equal keywords
        // adjacent (the grouping comparator relies on this)
        if (o.getName().compareTo(this.getName()) == 0) {
            int flag = o.getNum() - this.getNum();
            if (flag == 0) {
                return 0;
            } else if (flag > 0) {
                return 1;
            } else {
                return -1;
            }
        } else {
            return o.getName().compareTo(this.getName());
        }
    }
}
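As a quick sanity check of this ordering (a standalone snippet for illustration only, not part of the job; the class name is made up):

public class TestBeanOrderCheck {
    public static void main(String[] args) {
        TestBean a = new TestBean("huangxiaoming", "mapreduce-4-1.txt", 3);
        TestBean b = new TestBean("huangxiaoming", "mapreduce-4-2.txt", 1);
        // Same keyword, so ordering falls back to the count in descending order:
        // a has the larger count and therefore sorts before b
        System.out.println(a.compareTo(b)); // prints -1
    }
}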
Custom grouping comparator: UserGC
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class UserGC extends WritableComparator{
    public UserGC() {
        // Register TestBean as the key type; "true" asks the parent to create
        // key instances so that compare() receives deserialized TestBean objects
        super(TestBean.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        // Compare on the keyword only, so every record with the same name is
        // grouped into a single reduce() call regardless of file name or count
        TestBean pa = (TestBean) a;
        TestBean pb = (TestBean) b;
        return pa.getName().compareTo(pb.getName());
    }
}
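One caveat worth noting: the job above relies on Hadoop's default single reduce task, so every keyword ends up at the same reducer. If the number of reducers were increased, a partitioner that partitions on the keyword alone would also be needed, otherwise records with the same name could be sent to different reducers. A minimal sketch of such a partitioner (NamePartitioner is a hypothetical name, not part of the original code; it would be registered with job.setPartitionerClass(NamePartitioner.class)):

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class NamePartitioner extends Partitioner<TestBean, NullWritable> {
    @Override
    public int getPartition(TestBean key, NullWritable value, int numPartitions) {
        // Partition on the keyword only, so records that share a name always
        // land on the same reducer, whatever their fileName or num
        return (key.getName().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}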