2_MapReduce原理

最新推荐文章于 2022-06-10 19:27:27 发布

oifengo

最新推荐文章于 2022-06-10 19:27:27 发布

阅读量200

点赞数

分类专栏： # 爬梯

本文链接：https://blog.csdn.net/weixin_39381833/article/details/107164547

版权

爬梯专栏收录该内容

47 篇文章 0 订阅

订阅专栏

文章目录

前言
- Properties
MR
- 流程解析
2 WordCount实现
3 MR

前言

context 缓存
constants
intermediate
sort
Properties
utils
expert
recursive

Properties

使用JAVA读写Properties属性文件

比较少的数据，配置信息等
上传到DB太过于麻烦，可以直接保存为Properties。原因：
Properties类有专门的读写方法来读写Properties属性文件，不用担心读写格式的问题，只需要给Properties类提供一个读写流即可

//读取属性文件流的方法
public void load(InputStream inStream) throws IOException{}
//写属性文件流的方法
public void store(OutputStream out,String comments) throws IOException{}

1 指定文件流

//属性集合对象
Properties prop = new Properties();
//属性文件流
FileInputStream fis = new FileInputStream("prop.properties");
//将属性文件流装载到Properties
prop.load(fis);

2 读取属性
属性文件都是键值对

//获取属性值，sitename已经在文件中定义
prop.getProperty("sitename")
//获取属性值，返回一个默认值，但并不修改属性文件
prop.getProperty("county","China")

3 修改保存属性

//修改name的属性值
prop.setProperty("name","xxx");
//添加一个新的属性age
prop.setProperty("age","22");

//文件输出流
FileOutputStream fos = new FileOutputStream("pro.properties");
//将Properties集合保存到流中
prop.store(fos, "Copyright(c) Boxcode Studio");
fos.close();

MR

在这里插入图片描述

流程解析

在这里插入图片描述
Map阶段

一个block对应一个Maptask
一个Maptask按照分隔符进行切割
每个单词出现一次

Shuffle阶段

相同的key分发到同一个reduce处理

Reduce阶段

Reduce会根据指定的规则进行划分

比如图中 a - n 到第一个ReduceTask
	o - z 到第二个ReduceTask

2 WordCount实现

2.1 Context

缓存用来存放数据出现的次数

package wc;

import java.util.HashMap;
import java.util.Map;

/**
 * Simple in-memory key/value cache shared between the mapper and the driver.
 *
 * <p>Values are stored in a {@link HashMap}; writing an existing key replaces its value.
 */
public class ifengContext {

    /** Backing store for everything written through {@link #write(Object, Object)}. */
    private final Map<Object, Object> entries = new HashMap<>();

    /**
     * Stores {@code value} under {@code key}, replacing any previous value.
     *
     * @param key   cache key
     * @param value value to associate with the key
     */
    public void write(Object key, Object value) {
        entries.put(key, value);
    }

    /**
     * Looks up the value stored under {@code key}.
     *
     * @param key cache key
     * @return the stored value, or {@code null} when the key has not been written
     */
    public Object get(Object key) {
        return entries.get(key);
    }

    /**
     * Returns the live backing map (not a copy).
     *
     * @return the map holding every cached entry
     */
    public Map<Object, Object> getCacheMap() {
        return entries;
    }
}

2.2 Mapper接口

实现一个Mapper接口，
进来的数据为一行一行写入到缓存中去

package wc;

/**
 * Contract for a map step that consumes input one line at a time.
 */
public interface ifengMapper {

    /**
     * Processes a single input line and records its results in the given context.
     *
     * @param line    one line of input text
     * @param context cache that receives the mapper's output
     */
    void map(String line, ifengContext context);
}

2.3 WordCountMapper

package wc;

/**
 * Word-count mapper: splits each line on commas and keeps a running count per word
 * in the supplied context.
 */
public class WordCountMapper implements ifengMapper {

    /**
     * Increments the counter of every comma-separated word found in {@code line}.
     *
     * <p>Empty tokens are ignored, so blank lines and consecutive delimiters do not
     * produce a count for the empty string.
     *
     * @param line    one line of comma-separated words
     * @param context cache holding the current count for each word
     */
    @Override
    public void map(String line, ifengContext context) {
        for (String word : line.split(",")) {
            if (word.isEmpty()) {
                // "".split(",") yields a single empty token and "a,,b" yields one in the
                // middle; neither is a word, so it must not be counted.
                continue;
            }

            Object current = context.get(word);
            // The context stores plain Objects, so the count is parsed back from its text form.
            int next = (current == null) ? 1 : Integer.parseInt(current.toString()) + 1;
            context.write(word, next);
        }
    }
}

2.4 main

package wc;



import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.Map;


/**
 * Word count over HDFS without MapReduce: reads every file directly under the input
 * directory, feeds each line to a {@link WordCountMapper}, and writes the resulting
 * counts back to HDFS.
 */
public class HDFSWcAPP_P {

    /**
     * Runs the word count against the hard-coded HDFS cluster and paths.
     *
     * <p>Input is every file directly under {@code /hdfsapi1/} (non-recursive); output is
     * written to {@code /hdfsapi1/wc/out/wc.out} as {@code word<TAB>count} lines and is
     * also echoed to standard output.
     *
     * @param args unused
     * @throws Exception if the URI is invalid or any HDFS operation fails
     */
    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        conf.set("dfs.client.use.datanode.hostname","true");
        conf.set("dfs.replication","1");
        URI uri = new URI("hdfs://10.103.66.15:9000");

        ifengMapper mapper = new WordCountMapper();
        ifengContext context = new ifengContext();

        // try-with-resources guarantees the file system and every stream are closed
        // even when reading or writing fails part-way through.
        try (FileSystem fileSystem = FileSystem.get(uri, conf, "hadoop")) {

            Path input = new Path("/hdfsapi1/");
            RemoteIterator<LocatedFileStatus> iterator = fileSystem.listFiles(input, false);
            while (iterator.hasNext()) {
                LocatedFileStatus file = iterator.next();
                try (FSDataInputStream in = fileSystem.open(file.getPath());
                     BufferedReader reader = new BufferedReader(
                             new InputStreamReader(in, StandardCharsets.UTF_8))) {
                    String line;
                    while ((line = reader.readLine()) != null) {
                        mapper.map(line, context);
                    }
                }
            }

            Map<Object, Object> cacheMap = context.getCacheMap();

            // path = parent + child  ==  /hdfsapi1/wc/out/wc.out
            Path output = new Path("/hdfsapi1/wc/out/");
            try (FSDataOutputStream out = fileSystem.create(new Path(output, new Path("wc.out")))) {
                for (Map.Entry<Object, Object> entry : cacheMap.entrySet()) {
                    System.out.println(entry.getKey() + "..." + entry.getValue());
                    // Explicit charset so the output does not depend on the JVM's platform default.
                    out.write((entry.getKey() + "\t" + entry.getValue() + "\n")
                            .getBytes(StandardCharsets.UTF_8));
                }
            }
        }
    }
}

在这里插入图片描述

3 MR

序列化

为了让数据在集群之间IO，必须把数据序列化
序列化：内存的对象转成字节数组以便于存储或者网络的数据传输

	A   JVM  
	B   JVM
	C   JVM 
	..
	N   JVM

反序列化：字节数组转成对象

boolean BooleanWritable
int IntWritable
byte ByteWritable
float FloatWritable
long LongWritable
double DoubleWritable
string Text

Driver

package MapReduce;

/*
* 八股文编程
* */


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;


/**
 * Driver that configures and submits the word-count MapReduce job.
 */
public class WordCountDriver {

    /**
     * Builds the job, wires up the mapper/reducer and their key/value types, sets the
     * input and output paths, then blocks until the job finishes.
     *
     * <p>The JVM exits with status 0 when the job succeeds and 1 otherwise.
     *
     * @param args unused; input and output paths are hard-coded
     * @throws IOException            if the job cannot be created or submitted
     * @throws ClassNotFoundException if a configured job class cannot be loaded
     * @throws InterruptedException   if the wait for completion is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        String input = "data/wc.txt";
        String output = "out/wcout.txt";

        // 1. Obtain the Job. Pass the configuration in so that any setting placed on it
        //    actually reaches the job instead of being silently ignored.
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // 2. Set the main class used to locate the job jar
        job.setJarByClass(WordCountDriver.class);

        // 3. Set the Mapper and Reducer
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        // 4. Set the key and value types emitted by the Mapper stage
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // 5. Set the key and value types emitted by the Reduce stage
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // 6. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        // 7. Submit the Job and wait for it to finish
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }

}

设置判断输出路径是否已经存在

/**
 * File-system helpers for job setup.
 */
public class FileUtils {

    /**
     * Removes the output path, recursively, when it already exists; does nothing otherwise.
     *
     * @param configuration configuration used to obtain the file system
     * @param output        path of the output location to clear
     * @throws Exception if the file system cannot be obtained or the delete fails
     */
    public static void deleteOutput(Configuration configuration, String output) throws Exception {
        FileSystem fs = FileSystem.get(configuration);
        Path target = new Path(output);

        // Nothing to clean up when the path is absent.
        if (!fs.exists(target)) {
            return;
        }
        fs.delete(target, true);
    }
}

boolean result = job.waitForCompletion(true);

waitForCompletion{
	submit{
			connect{
				Cluster{
					
				}
			}
		}
}

job.waitForCompletion
- job.submit
  - submitter
    - checkSpecs
    - submitJobDir
    - copyAndConfigureFiles
    - submitJobFile
- cluster

序列化接口

序列化必须实现Writable接口

public interface Writable {

writable接口中给的自定义序列化

*     public class MyWritable implements Writable {
 *       // Some data     
 *       private int counter;
 *       private long timestamp;
 *       
 *       public void write(DataOutput out) throws IOException {
 *         out.writeInt(counter);
 *         out.writeLong(timestamp);
 *       }
 *       
 *       public void readFields(DataInput in) throws IOException {
 *         counter = in.readInt();
 *         timestamp = in.readLong();
 *       }
 *       
 *       public static MyWritable read(DataInput in) throws IOException {
 *         MyWritable w = new MyWritable();
 *         w.readFields(in);
 *         return w;
 *       }
 *     }

输入数据 ==> InputFormat
					==> 有几个InputSplit
						==> MapTask
					==> RecordReader
						==> LineRecordReader

SplitSize 的最大最小值

long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
	max(1,1) = 1
long maxSize = getMaxSplitSize(job);
	0x7fffffffffffffffL

splitSize = Math.max(minSize, Math.min(maxSize, blockSize));

	max(1, min(32M, 0x7fffffffffffffffL))
	max(1, 32M) = 32M