Experiment Requirements
- Implement an inverted index: count how many times each word appears in each file; see the lab handout for the principle behind inverted indexes;
- Input: create a few files of your own, e.g. a.txt, b.txt, c.txt. Each file contains several lines of words separated by spaces. Upload these files to the /in directory on HDFS. For example, a.txt contains:
hadoop google scau
map hadoop reduce
hive hello hbase
- Write a program that produces the inverted index;
- Partitioning requirement: words starting with A-M (upper or lower case) go to partition 0; words starting with N-Z go to partition 1; words starting with any other character go to partition 2;
- Output format per word: hadoop a.txt->2,b.txt->1, where hadoop is the word (and the output key) and "a.txt->2,b.txt->1" is the output value, meaning the word hadoop appears 2 times in a.txt and 1 time in b.txt;
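Before either job runs, the input files have to be placed under /in on HDFS. A minimal sketch of the upload, assuming the hdfs command is on the PATH and a.txt, b.txt, c.txt sit in the current directory:
hdfs dfs -mkdir -p /in
hdfs dfs -put a.txt b.txt c.txt /in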
Code
First MapReduce job
Mapper class
K1 is Object, V1 is Text, K2 is Text (a string), V2 is IntWritable.
Output form: word->filename 1
package mr.index.first;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class FirstIndexMapper extends Mapper<Object, Text, Text, IntWritable> {
    String filename;
    Text k = new Text();
    IntWritable v = new IntWritable();

    @Override
    protected void setup(Mapper<Object, Text, Text, IntWritable>.Context context)
            throws IOException, InterruptedException {
        // Get the name of the file this split comes from
        FileSplit split = (FileSplit) context.getInputSplit();
        filename = split.getPath().getName();
    }

    @Override
    protected void map(Object key, Text value, Mapper<Object, Text, Text, IntWritable>.Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] words = line.split(" ");
        // Emit ("word->filename", 1) for every word on the line
        for (String word : words) {
            k.set(word + "->" + filename);
            v.set(1);
            context.write(k, v);
        }
    }
}
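For the first line of the sample a.txt, hadoop google scau, this mapper emits the pairs:
(hadoop->a.txt, 1)
(google->a.txt, 1)
(scau->a.txt, 1)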
Reducer class
K3, like K2, is Text; V3 is IntWritable.
Output form: word->filename count
package mr.index.first;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class FirstIndexReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    IntWritable v = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values,
            Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
        // Sum the 1s emitted for the same "word->filename" key
        int count = 0;
        for (IntWritable value : values) {
            count += value.get();
        }
        v.set(count);
        context.write(key, v);
    }
}
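On the sample a.txt alone, where hadoop appears twice and every other word once, the first job's output contains lines such as the following (key and count separated by a tab):
google->a.txt 1
hadoop->a.txt 2
scau->a.txt 1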
main
package mr.index.first;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FirstIndexMain {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(FirstIndexMain.class);
        job.setMapperClass(FirstIndexMapper.class);
        job.setReducerClass(FirstIndexReducer.class);
        job.setNumReduceTasks(1);
        job.setMapOutputKeyClass(Text.class);          // key type of the map output
        job.setMapOutputValueClass(IntWritable.class); // value type of the map output
        job.setOutputKeyClass(Text.class);             // key type of the reduce output
        job.setOutputValueClass(IntWritable.class);    // value type of the reduce output
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
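A sketch of launching the first job, assuming the classes are packaged into a jar (index.jar is a hypothetical name) and the output directory /out1 does not exist yet:
hadoop jar index.jar mr.index.first.FirstIndexMain /in /out1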
Second MapReduce job
- Uses the first job's output directory as its input directory; the word becomes the key, and the value collects the filename->count entries.
Mapper class
K1 is Object, V1 is Text, K2 is Text, V2 is Text.
Output form: the word as the key, "filename<TAB>count" as the value (the tab becomes -> in the reducer)
package mr.index.second;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class SecondIndexMapper extends Mapper<Object, Text, Text, Text> {
    Text k = new Text();
    Text v = new Text();

    @Override
    protected void map(Object key, Text value, Mapper<Object, Text, Text, Text>.Context context)
            throws IOException, InterruptedException {
        // Each input line looks like "word->filename<TAB>count"
        String line = value.toString();
        String[] parts = line.split("->");
        k.set(parts[0]); // the word
        v.set(parts[1]); // "filename<TAB>count"
        context.write(k, v);
    }
}
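For example, the first-job output line hadoop->a.txt<TAB>2 splits on "->" into the key hadoop and the value a.txt<TAB>2; the tab inside the value is turned into -> by the reducer below.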
Reducer class
K3, like K2, is Text; V3 is Text.
Output form: word filename1->count1,filename2->count2
package mr.index.second;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class SecondIndexReducer extends Reducer<Text, Text, Text, Text> {
    Text v = new Text();

    @Override
    protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context)
            throws IOException, InterruptedException {
        // Rewrite each "filename<TAB>count" value as "filename->count" and
        // join the entries with commas, matching the required output format
        StringBuilder s = new StringBuilder();
        for (Text value : values) {
            if (s.length() > 0) {
                s.append(",");
            }
            s.append(value.toString().replace("\t", "->"));
        }
        v.set(s.toString());
        context.write(key, v);
    }
}
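With the counts from the example in the requirements (hadoop twice in a.txt, once in b.txt), this reducer emits a line like the following, key and value separated by a tab (the order of the entries in the value is not guaranteed):
hadoop a.txt->2,b.txt->1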
Partitioner class
Runs after the map phase; the key/value types it receives are the same as K2 and V2.
package mr.index.second;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class SecondIndexPartitioner extends Partitioner<Text, Text> {
    @Override
    public int getPartition(Text key, Text value, int numPartitions) {
        String word = key.toString().trim();
        if (word.length() == 0) {
            return 0;
        }
        // Upper-case the first character so a-m / n-z follow the same rule
        char firstChar = Character.toUpperCase(word.charAt(0));
        if (firstChar >= 'A' && firstChar <= 'M') {
            return 0; // A-M -> partition 0
        } else if (firstChar >= 'N' && firstChar <= 'Z') {
            return 1; // N-Z -> partition 1
        }
        return 2; // any other first character -> partition 2
    }
}
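A minimal local check of the partition rule (PartitionCheck is a hypothetical helper written for illustration, not part of the submitted job):
package mr.index.second;

import org.apache.hadoop.io.Text;

public class PartitionCheck {
    public static void main(String[] args) {
        SecondIndexPartitioner p = new SecondIndexPartitioner();
        Text empty = new Text("");
        System.out.println(p.getPartition(new Text("hadoop"), empty, 3)); // 0: 'h' is in A-M
        System.out.println(p.getPartition(new Text("scau"), empty, 3));   // 1: 's' is in N-Z
        System.out.println(p.getPartition(new Text("123abc"), empty, 3)); // 2: starts with a digit
    }
}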
main
package mr.index.second;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SecondIndexMain {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(SecondIndexMain.class);
        job.setMapperClass(SecondIndexMapper.class);
        job.setReducerClass(SecondIndexReducer.class);
        job.setPartitionerClass(SecondIndexPartitioner.class);
        job.setNumReduceTasks(3);              // one reduce task per partition
        job.setMapOutputKeyClass(Text.class);  // key type of the map output
        job.setMapOutputValueClass(Text.class);// value type of the map output
        job.setOutputKeyClass(Text.class);     // key type of the reduce output
        job.setOutputValueClass(Text.class);   // value type of the reduce output
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
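The second job takes the first job's output directory as its input, so the two jobs run back to back; a sketch with the same hypothetical index.jar:
hadoop jar index.jar mr.index.second.SecondIndexMain /out1 /out2
Because three reduce tasks are configured, /out2 ends up with part-r-00000 (A-M words), part-r-00001 (N-Z) and part-r-00002 (everything else).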
Experiment Data
Run Results
First MapReduce results
Second MapReduce results
Summary
- The inverted index is built with two MapReduce jobs. The first job's logic and structure closely follow the earlier WordCount program; it is essentially WordCount with the filename folded into K2. In the second job, the reducer accumulates not word counts but the value strings, concatenating them appropriately.
- Regarding V1: whether the line holds words (first job) or a count that was written as an IntWritable (second job), anything read back from a file arrives as Text, and that is what allows V1 to be split as a string.