import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
// Count, for each keyword, how many times it appears on each line of each document
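// For example, if the first line of mapreduce-4-1.txt is "hello world hello",
// the mapper emits hello -> "mapreduce-4-1.txt:1,2" and world -> "mapreduce-4-1.txt:1,1",
// and the reducer joins all of a word's entries into one tab-separated output line.
// (Illustrative values, not captured program output.)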
public class kaoshi1 {
    static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
        Text mk = new Text();
        Text mv = new Text();
        String filename = "";
        // Line number within the current split. Each mapper instance processes
        // exactly one split, so for a text file small enough to fit in a single
        // split this is the line number within that file.
        private int lineNo = 0;

        @Override
        // setup() runs once when the map task starts; use it to read file metadata
        protected void setup(Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // Get the file name from this task's input split
            InputSplit insplit = context.getInputSplit();
            FileSplit fs = (FileSplit) insplit;
            filename = fs.getPath().getName();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            lineNo++; // TextInputFormat passes one line of the file per call
            // Count how often each word occurs on this line,
            // e.g. "liangchaowei love liujialing"
            Map<String, Integer> word = new HashMap<>();
            String[] sp = value.toString().split(" ");
            for (String v : sp) {
                word.put(v, word.getOrDefault(v, 0) + 1);
            }
            // Emit word -> "filename:lineNumber,countOnThisLine"
            for (Map.Entry<String, Integer> e : word.entrySet()) {
                mk.set(e.getKey());
                mv.set(filename + ":" + lineNo + "," + e.getValue());
                context.write(mk, mv);
            }
        }
    }
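    // After the shuffle, every "filename:line,count" value emitted for the same
    // word (from all input files) is grouped into a single reduce() call.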
    static class MyReducer extends Reducer<Text, Text, Text, Text> {
        Text outValue = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values,
                Reducer<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // Join all "filename:line,count" entries for this word with tabs
            StringBuilder sb = new StringBuilder();
            for (Text v : values) {
                sb.append(v.toString()).append("\t");
            }
            outValue.set(sb.toString());
            context.write(key, outValue);
        }
    }
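    // The final output therefore has one line per word, for example
    // (illustrative, assuming two input files):
    //   hello    mapreduce-4-1.txt:1,2    mapreduce-4-2.txt:3,1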
    public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        // Identify as the "hadoop" user so a local run can write to the cluster
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        // Load the configuration and create the job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(kaoshi1.class);     // driver (main) class for the jar
        job.setMapperClass(MyMapper.class);   // mapper implementation
        job.setReducerClass(MyReducer.class); // reducer implementation
        job.setOutputKeyClass(Text.class);    // job output key type
        job.setOutputValueClass(Text.class);  // job output value type
        FileInputFormat.addInputPath(job, new Path("hdfs://hadoop01:9000/ksin02")); // input directory
        FileSystem fs = FileSystem.get(new URI("hdfs://hadoop01:9000"), conf); // handle to HDFS
        Path path = new Path("/ksout05"); // output directory
        if (fs.exists(path)) { // the job fails on startup if the output directory already exists
            fs.delete(path, true);
        }
        FileOutputFormat.setOutputPath(job, path);
        System.exit(job.waitForCompletion(true) ? 0 : 1); // true = print progress logs
    }
}
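To try the job, one typical workflow is to package the class into a jar and submit it with the hadoop launcher; the jar name below is an assumption, and the paths are the ones hardcoded in main():

hadoop jar kaoshi1.jar kaoshi1
hdfs dfs -cat /ksout05/part-r-00000

No command-line arguments are needed because the input directory (hdfs://hadoop01:9000/ksin02) and output directory (/ksout05) are fixed in the driver.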