Follow the column 《破茧成蝶——大数据篇》 for the other articles in this series~
1. Requirements Analysis
Suppose we have a set of log files and need to count the occurrences of certain words or characters in them, keeping each count tied to the log file it came from, i.e., building an inverted index. The original log file content is shown below:
The expected result data is:
Implementation analysis: the requirement above can be carried out in two steps. 1. First, count how many times each word appears in each log file. 2. Then group the records by word and merge the per-file counts.
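To make the two steps concrete, here is a hypothetical trace (the file names a.txt/b.txt and the counts are invented for illustration; the real sample data is in the screenshots above):

Step one output (key = word--filename, value = count):
    hadoop--a.txt    3
    hadoop--b.txt    2
    spark--a.txt     1

Step two output (key = word, value = list of filename-->count):
    hadoop    a.txt-->3    b.txt-->2
    spark     a.txt-->1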
2. Code Implementation
2.1 The Mapper class for step one
package com.xzw.hadoop.mapreduce.inverted_index;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

/**
 * Step-one Mapper: emits <word--filename, 1> for every word in the input.
 *
 * @author: xzw
 * @create_date: 2020/12/7 8:40
 */
public class OneIndexMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    String name;
    Text k = new Text();
    IntWritable v = new IntWritable();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Get the name of the file this input split comes from
        FileSplit split = (FileSplit) context.getInputSplit();
        name = split.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Read one line and split it on spaces
        String line = value.toString();
        String[] fields = line.split(" ");

        for (String word : fields) {
            // 2. Concatenate the word and the file name to form the key
            k.set(word + "--" + name);
            v.set(1);

            // 3. Emit <word--filename, 1>
            context.write(k, v);
        }
    }
}
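One caveat: context.getInputSplit() returns an InputSplit, and the cast to FileSplit only succeeds for file-based input formats such as the default TextInputFormat; with something like CombineTextInputFormat it would throw a ClassCastException. A minimal defensive variant of setup() (my addition, assuming the extra import org.apache.hadoop.mapreduce.InputSplit):

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    InputSplit split = context.getInputSplit();
    if (split instanceof FileSplit) {
        // File-based split: we can recover the source file name
        name = ((FileSplit) split).getPath().getName();
    } else {
        // Non-file split: fall back instead of crashing the task
        name = "unknown";
    }
}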
2.2 The Reducer class for step one
package com.xzw.hadoop.mapreduce.inverted_index;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Step-one Reducer: sums the counts for each <word--filename> key.
 *
 * @author: xzw
 * @create_date: 2020/12/7 8:49
 */
public class OneIndexReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    IntWritable v = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0; // accumulator
        for (IntWritable value : values) {
            sum += value.get();
        }

        v.set(sum);
        context.write(key, v);
    }
}
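Note that this job relies on Hadoop's default TextOutputFormat, which writes the key, a tab character, and then the value. Given the (hypothetical) key hadoop--a.txt with values [1, 1, 1], the line written to output1 is:

hadoop--a.txt	3

That tab between the file name and the count is exactly what step two later rewrites into "-->".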
2.3 The Driver class for step one
package com.xzw.hadoop.mapreduce.inverted_index;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Driver for step one.
 *
 * @author: xzw
 * @create_date: 2020/12/7 9:00
 */
public class OneIndexDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Hard-coded local paths for testing; these override any command-line arguments.
        // Note that output1 must not already exist, and since it is created inside the
        // input directory, a rerun of step one would pick it up as an (invalid) input.
        args = new String[]{"C:\\Users\\Machenike\\Desktop\\file", "C:\\Users\\Machenike\\Desktop\\file\\output1"};

        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        job.setJarByClass(OneIndexDriver.class);
        job.setMapperClass(OneIndexMapper.class);
        job.setReducerClass(OneIndexReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
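Because the step-one reduce is a plain sum (commutative and associative), the reducer can also safely serve as a combiner to shrink the shuffle volume. This one-liner is an optional tweak, not part of the original driver:

job.setCombinerClass(OneIndexReducer.class); // pre-aggregate counts on the map side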
2.4 Test results for step one
2.5 The Mapper class for step two
package com.xzw.hadoop.mapreduce.inverted_index;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Step-two Mapper: splits each step-one record on "--" so that the word
 * becomes the key and "filename\tcount" becomes the value.
 *
 * @author: xzw
 * @create_date: 2020/12/7 9:21
 */
public class TwoIndexMapper extends Mapper<LongWritable, Text, Text, Text> {
    Text k = new Text();
    Text v = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // A step-one output line looks like "word--filename<TAB>count"
        String line = value.toString();
        String[] fields = line.split("--");

        k.set(fields[0]);
        v.set(fields[1]);
        context.write(k, v);
    }
}
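To see what this mapper does to a single record, trace one hypothetical step-one line through it (this assumes neither words nor file names contain the literal separator "--"):

String record = "hadoop--a.txt\t3";   // one line of step-one output (illustrative)
String[] fields = record.split("--");
// fields[0] = "hadoop"       -> becomes the step-two key
// fields[1] = "a.txt\t3"     -> becomes the value; it still carries the tab and the count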
2.6 The Reducer class for step two
package com.xzw.hadoop.mapreduce.inverted_index;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Step-two Reducer: concatenates all "filename\tcount" values for a word
 * into a single "filename-->count" list.
 *
 * @author: xzw
 * @create_date: 2020/12/7 9:26
 */
public class TwoIndexReducer extends Reducer<Text, Text, Text, Text> {
    Text v = new Text();

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        StringBuilder sb = new StringBuilder();
        for (Text value : values) {
            // Replace the tab between filename and count with "-->"
            sb.append(value.toString().replace("\t", "-->")).append("\t");
        }

        v.set(sb.toString());
        context.write(key, v);
    }
}
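Continuing the hypothetical trace: if the key hadoop arrives with values [a.txt\t3, b.txt\t2], each replace() turns the embedded tab into "-->" and the final output line becomes:

hadoop	a.txt-->3	b.txt-->2

(The loop leaves a trailing tab after the last entry; it is harmless here, but could be removed with sb.toString().trim() if cleaner output is wanted.)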
2.7 The Driver class for step two
package com.xzw.hadoop.mapreduce.inverted_index;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Driver for step two.
 *
 * @author: xzw
 * @create_date: 2020/12/7 9:38
 */
public class TwoIndexDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Step two reads the output of step one (output1) and writes to output2.
        args = new String[]{"C:\\Users\\Machenike\\Desktop\\file\\output1",
                "C:\\Users\\Machenike\\Desktop\\file\\output2"};

        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        job.setJarByClass(TwoIndexDriver.class);
        job.setMapperClass(TwoIndexMapper.class);
        job.setReducerClass(TwoIndexReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
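Because step two consumes step one's output, the two jobs must run in that order. For convenience they can also be submitted from a single combined driver; the sketch below is my addition (the class name IndexDriver and the three-argument path convention are assumptions, not part of the original post), taking the paths from the command line instead of hard-coding them:

package com.xzw.hadoop.mapreduce.inverted_index;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class IndexDriver {
    public static void main(String[] args) throws Exception {
        // args: <input dir> <intermediate dir> <final output dir>
        Configuration conf = new Configuration();

        // Step one: count each word per file
        Job one = Job.getInstance(conf, "one-index");
        one.setJarByClass(IndexDriver.class);
        one.setMapperClass(OneIndexMapper.class);
        one.setReducerClass(OneIndexReducer.class);
        one.setOutputKeyClass(Text.class);
        one.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(one, new Path(args[0]));
        FileOutputFormat.setOutputPath(one, new Path(args[1]));
        if (!one.waitForCompletion(true)) {
            System.exit(1); // abort if step one fails
        }

        // Step two: group the per-file counts by word
        Job two = Job.getInstance(conf, "two-index");
        two.setJarByClass(IndexDriver.class);
        two.setMapperClass(TwoIndexMapper.class);
        two.setReducerClass(TwoIndexReducer.class);
        two.setOutputKeyClass(Text.class);
        two.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(two, new Path(args[1]));
        FileOutputFormat.setOutputPath(two, new Path(args[2]));
        System.exit(two.waitForCompletion(true) ? 0 : 1);
    }
}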
2.8 Test results for step two