1. Create a local file "words.txt" and upload it to HDFS under "/neusoftin"
package hdfs;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
/**
 * HDFS file upload
 * @author Administrator
 *
 */
public class CreateFile {
public static void main(String[] args) throws Exception {
// Configuration object holding the cluster connection settings
Configuration conf = new Configuration();
// HDFS (NameNode) access address on the Linux server
conf.set("fs.defaultFS", "hdfs://master:8020");
// Obtain a FileSystem handle for HDFS operations
FileSystem hdfs = FileSystem.get(conf);
// Content to write into the uploaded file
byte[] buf = ("BEIJING, April 14 (Xinhua) According to Sputnik's report on April 14, the Russian " +
"side said that two Ukrainian combat helicopters entered Russian airspace and attacked " +
"residential buildings. At present, the Ukrainian side has not yet responded to this " +
"news. This is the second time since the outbreak of the Russian-Ukrainian conflict " +
"that Russia has claimed that Ukrainian helicopters have entered Russian airspace." +
"According to reports, the Russian investigation committee said that Ukrainian helicopters" +
" carried out at least six attacks on the residential building in Klimovo. The Committee " +
"has launched a criminal investigation into the attack." +
"Not long ago, Alexander Kuznetsov, assistant minister of health of Russia, said that " +
"the Ukrainian army attacked Bryansk, causing seven injuries, and all the injured were " +
"hospitalized." +
"For the second time, Russia claimed that Ukrainian warplanes entered Russian airspace." +
"On April 1, the Russian Defense Ministry announced that an oil facility in Russia was attacked " +
"and caught fire by Ukrainian military helicopters on the same day. This is the first time that " +
"Russia has reported Ukraine's air strikes on its territory since Russia launched a special" +
" military operation against Ukraine on February 24th." +
"Russian Defense Ministry spokesman Konashenkov said that two Ukrainian Mi -24 helicopters" +
" entered Russian airspace, targeting a civilian petroleum product on the outskirts" +
" of Belgorod, the capital of Belgorod.").getBytes();
// Target path in HDFS
Path dst = new Path("/neusoftin/words.txt");
// Create the file at the target path and open an output stream
FSDataOutputStream out = hdfs.create(dst);
out.write(buf, 0, buf.length); // Write the content to the file
out.close();
// Verify that the file was created successfully
System.out.println(hdfs.exists(dst));
}
}
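Note that the code above writes the text directly into HDFS rather than uploading a file that already exists on the local disk. If words.txt has actually been created locally first, as the step title suggests, a minimal alternative sketch uses FileSystem.copyFromLocalFile (assuming the same master:8020 address and that words.txt sits in the program's working directory; the class name UploadFile is only for illustration):
package hdfs;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
public class UploadFile {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
conf.set("fs.defaultFS", "hdfs://master:8020");
FileSystem hdfs = FileSystem.get(conf);
// Copy the local words.txt into /neusoftin on HDFS
hdfs.copyFromLocalFile(new Path("words.txt"), new Path("/neusoftin/words.txt"));
// Verify that the upload succeeded
System.out.println(hdfs.exists(new Path("/neusoftin/words.txt")));
}
}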
2. Implement MapReduce with the Tool utility class: WordCountTool
package mapreduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.BufferedReader;
import java.io.InputStreamReader;
/**
 * WordCount driver: configures the job and launches it through ToolRunner and run()
 */
public class WordCountTool extends Configured implements Tool {
public static void main(String[] args) throws Exception {
// Cluster connection configuration
Configuration conf = new Configuration();
conf.set("fs.defaultFS", "hdfs://master:8020");
FileSystem hdfs = FileSystem.get(conf);
// Input path and files to read
String input ="/neusoftin/*.txt";
String output= "/neusoftout"; // Final MapReduce output; this path must not exist before the job runs
Path outputpath = new Path(output);
// Delete the output folder before running, if it already exists
if(hdfs.exists(outputpath)){
hdfs.delete(outputpath, true); // recursive delete
}
// Launch the job through the Tool utility (args are hard-coded so the class can run directly from the IDE)
args = new String[]{input, output};
int re =ToolRunner.run(conf,new WordCountTool(),args);
System.exit(re);
}
@Override
public int run(String[] strings) throws Exception {
// Build the job from the configuration injected by ToolRunner
Job job =Job.getInstance(getConf());
job.setJarByClass(WordCountTool.class); // Driver class used to locate the job jar
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.setInputPaths(job,strings[0]); // Input path(s)
//Mapper
job.setMapperClass(WordCountMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setCombinerClass(WordCountReducer.class); // Combiner: pre-aggregates counts on the map side
// reducer
job.setReducerClass(WordCountReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// Result file output
job.setOutputFormatClass(TextOutputFormat.class);
TextOutputFormat.setOutputPath(job,new Path(strings[1])); // Output path
// Run the job and wait for it to finish
boolean result = job.waitForCompletion(true);
FileSystem hdfs = FileSystem.get(getConf());
if(result){
// List the result files under the output path and print their contents
for(FileStatus fs: hdfs.listStatus(new Path(strings[1]))){
FSDataInputStream dis = hdfs.open(fs.getPath());
// Read the stream line by line and print it to the console
BufferedReader reader = new BufferedReader(new InputStreamReader(dis)); // bytes to characters
String line = reader.readLine();
while(line!=null){
System.out.println(line);
line = reader.readLine();
}
reader.close();
}
}
return result ? 0 : 1; // Non-zero exit code if the job failed
}
}
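As an alternative to the BufferedReader loop in run(), IOUtils.copyBytes from org.apache.hadoop.io.IOUtils can dump each result file to the console in a single call. A minimal sketch of that variant of the loop body, under the same assumptions as the class above:
FSDataInputStream dis = hdfs.open(fs.getPath());
// Copy the whole stream to standard output; the last argument closes the stream when done
org.apache.hadoop.io.IOUtils.copyBytes(dis, System.out, 4096, true);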
3. The WordCountMapper class
package mapreduce;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import java.util.StringTokenizer;
/**
 * Mapper for word counting.
 *
 * Input key/value: LongWritable, Text
 *   LongWritable : byte offset of the current line within the input file
 *                  (the input is read line by line by default)
 *   Text         : the content of one line; map() is called once per line
 * Output key/value: Text, IntWritable
 *   key   : a single word
 *   value : the count 1, to be summed by the combiner/reducer
 */
public class WordCountMapper extends Mapper<LongWritable, Text,Text, IntWritable> {
// Reusable output objects, so new ones are not created for every record
private Text outMapKey= new Text();
private static final IntWritable outMapValue = new IntWritable(1);
/**
 * @param key     byte offset of the line in the input file
 * @param value   the line of text to process
 * @param context used to emit the output key/value pairs
 * @throws IOException
 * @throws InterruptedException
 */
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
// Get the line of text to be counted
String line = value.toString();
// Skip blank lines before splitting
if(StringUtils.isBlank(line)){
return ;
}
// Use StringTokenizer to split the line into words
StringTokenizer st = new StringTokenizer(line);
while(st.hasMoreTokens()){ // Loop while there are more words to fetch
String word =st.nextToken(); // Get the next word
outMapKey.set(word);
context.write(outMapKey, outMapValue); // Emit the word and a count of 1 to the combiner/reducer
}
}
}
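As an illustration (not taken from the actual input text), if map() receives the line "Russia has claimed that Russia", it emits the pairs (Russia,1), (has,1), (claimed,1), (that,1) and (Russia,1); merging the two (Russia,1) pairs into (Russia,2) is left to the combiner and reducer.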
4. The WordCountReducer class
package mapreduce;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reducer: receives each word together with all the counts emitted by the
 * mappers/combiners and writes the word with its total count.
 * Input/output key/value: Text, IntWritable, Text, IntWritable
 */
public class WordCountReducer extends Reducer<Text, IntWritable,Text ,IntWritable> {
/**
 * @param key     the word
 * @param values  all counts emitted for this word
 * @param context used to write the final (word, total count) pair
 * @throws IOException
 * @throws InterruptedException
 */
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
int sum=0; // Accumulator for the total count
for(IntWritable value :values){ // Iterate over the counts for this word
sum+= value.get(); // Add each partial count
}
context.write(key,new IntWritable(sum)); // Emit the word with its total count
}
}
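Continuing the illustrative example from the mapper, the reducer would receive (Russia, [1, 1]) and write (Russia, 2). Because the job uses TextOutputFormat with Text keys and IntWritable values, each such pair appears in the part-r-00000 file under /neusoftout as a tab-separated line of the form word<TAB>count, which is exactly what the loop at the end of WordCountTool.run() prints to the console.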