6.824 Lab1 MapReduce解析与实现

最新推荐文章于 2024-06-16 17:22:24 发布

Enzo邵靳天

最新推荐文章于 2024-06-16 17:22:24 发布

阅读量402

点赞数

分类专栏：分布式

本文链接：https://blog.csdn.net/qq_39871498/article/details/91357458

版权

分布式专栏收录该内容

0 篇文章 0 订阅

订阅专栏

MapReduce, 批处理的典型之一。主要思想即“分而治之”，将一大批数据（一个大任务）分成多个子任务，分别进行运算（同时）（map），再将运算结果合起来（reduce）

master: 负责任务调度

mapper: 执行各个子任务，map运算

reducer: 执行结果汇总，reduce运算

例：在K/V的wordCount中，源数据为一个大文件，每个mapper负责一部分文件的count，mapper的运算结果(intermediate K/Vs)再交由reducer进行汇总(merge)。
本次实验中对MapReduce进行了基本的使用和简单的任务分发实现。

Part 1: Map/Reduce input and output

task: 完成Map/Reduce工作流程中的doMap()和doReduce()两部分

doMap()：读取输入的文件，创建nReduce个中间结果文件，调用mapF函数并将其计算的K/V结果通过hash散列映射到各中间结果文件。

	//	copyright  by  sjt@hnu.edu.cn
	
func doMap(
	jobName string, // the name of the MapReduce job
	mapTask int, // which map task this is
	inFile string,
	nReduce int, // the number of reduce task that will be run ("R" in the paper)
	mapF func(filename string, contents string) []KeyValue) {
	// openfile
	inputFile , err := os.Open(inFile)

	if err != nil {
		log.Fatalln("doMap : open file failure " ,err)

	}
	//close file
	defer inputFile.Close()

	// read fileInfo
	fileInfo , err := inputFile.Stat()

	if err != nil{
		log.Fatalln("daMap : getState failure " ,err )

	}

	//create  byte slice  to  prepare  for  Reading  inputFile  and  Writing  inputFile to this byte slice
	data := make([]byte, fileInfo.Size() )
	//ignore  returned  first variable(n bytes which are  bytes of inputFile )
	//Read  inputFile and Write data of inputFile into data variable
	_,err = inputFile.Read(data)

	//read err
	if err != nil {
		log.Fatalln("doMap : Read inputFile failure ",err)

	}


	//recall  mapF

	KeyValues  := mapF(fileInfo.Name(), string(data))


	//create  nReduce  intermediate  files
	for i := 0 ; i<nReduce ; i++ {
		filename := reduceName(jobName,mapTask,i)
		//create a new empty  file  called  filename
		reduceFile , err := os.Create(filename)
		if err != nil {
			log.Fatalln("doMap : reduceFile create failure " ,err)
		}

		defer  reduceFile.Close()

		enc := json.NewEncoder(reduceFile)
		// ignore  index of  k-v array
		// pick right k-v
		for _ , kv  :=  range KeyValues {

			// pick right k-v for specific  reduceFile
			//%nReduce =  0 , 1 ,... ,nReduce -1
			if ihash(kv.Key)%(nReduce) == i {

				err := enc.Encode(&kv)
				if err != nil {
					log.Fatalln("doMap: encode kv failure " ,err)
				}
			}

		}


	}

doReduce()：读取中间结果文件，将各K/V按K值存入一个字典变量kvMap，调用reduceF函数以对kvMap进行合并(wordCount无需额外操作)，并将reduceF的结果写入最终结果文件。
重点：清楚操作流程，Go的文件操作

/*copyright by sjt@hnu.edu.cn
*/
// doReduce runs one reduce task. For every map task m in [0, nMap) it reads
// the intermediate file reduceName(jobName, m, reduceTask), decodes the JSON
// key/value pairs in it, and groups the values by key. It then calls the
// user-supplied reduceF once per distinct key, in sorted key order, and writes
// each result to outFile as a JSON-encoded KeyValue.
//
// Any I/O, decoding or encoding failure terminates the process through
// log.Fatalln.
func doReduce(
	jobName string, // the name of the whole MapReduce job
	reduceTask int, // which reduce task this is
	outFile string, // write the output here
	nMap int, // the number of map tasks that were run ("M" in the paper)
	reduceF func(key string, values []string) string,
) {
	// kvMap collects, for each key, every value emitted for it by any map
	// task: {"A": ["1", "2", "3", ...]}.
	kvMap := make(map[string][]string)

	for m := 0; m < nMap; m++ {
		fileName := reduceName(jobName, m, reduceTask)

		inPutFile, err := os.Open(fileName)
		if err != nil {
			log.Fatalln("doReduce  inPutfile  open err ", err)
		}

		dec := json.NewDecoder(inPutFile)
		for dec.More() {
			var kv KeyValue
			if err := dec.Decode(&kv); err != nil {
				log.Fatalln("doReduce kv decode err ", err)
			}
			kvMap[kv.Key] = append(kvMap[kv.Key], kv.Value)
		}

		// Close now instead of deferring: a defer inside the loop would keep
		// all nMap files open until the function returns. The file was only
		// read, so a close error cannot lose data and is ignored.
		inPutFile.Close()
	}

	// Map iteration order is random; sort the keys for deterministic output.
	keysOrdered := make([]string, 0, len(kvMap))
	for key := range kvMap {
		keysOrdered = append(keysOrdered, key)
	}
	sort.Strings(keysOrdered)

	outPutFile, err := os.Create(outFile)
	if err != nil {
		log.Fatalln("doReduce  outPutFile generate err ", err)
	}

	enc := json.NewEncoder(outPutFile)
	for _, key := range keysOrdered {
		if err := enc.Encode(KeyValue{Key: key, Value: reduceF(key, kvMap[key])}); err != nil {
			log.Fatalln("doReduce encoder err", err)
		}
	}

	// A failed Close can mean the output never reached disk.
	if err := outPutFile.Close(); err != nil {
		log.Fatalln("doReduce  outPutFile close err ", err)
	}
}

Part 2: 实现单节点的wordCount

task: 即实现task1中调用的mapF()和reduceF函数。

mapF(): 将输入流(string)分割成单词并存入字典sliceKV{word: count}
func mapF(filename string, contents string) []mapreduce.KeyValue {... ...}
reduceF(): 在doReduce()中，同一个key的所有value已经被归并进kvMap[key]，因此wordCount的reduceF不需要再做合并，只需对传入的values进行统计（例如返回values的个数），并把结果以字符串形式返回；doReduce()会将该返回值写入最终结果文件。

Part3: distributing MapReduce tasks

task: 实现schedule.go文件，完成对多个worker进行map、reduce 任务的调度。

registerChan: workers(RPC address)的channel，存储当前空闲可用的worker（registered worker）
taskChan: 记录未完成或失败的任务，和sync.WaitGroup共同使用，以便重新执行失败任务。
具体实现如下：


 
 
   
   
    
    
   
   
   
   
    
        
     
     var ntasks 
     
     int
    
    
   
   

   
   
    
    
   
   
   
   
    
        
     
     var n_other 
     
     int 
     
     // number of inputs (for reduce) or outputs (for map)
    
    
   
   

   
   
    
    
   
   
   
   
    
        
     
     switch phase {  
     
     // 确定当前工作阶段，执行map还是reduce调度
    
    
   
   

   
   
    
    
   
   
   
   
    
        
     
     case mapPhase:
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
             ntasks = 
     
     len(mapFiles)
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
             n_other = nReduce
    
    
   
   

   
   
    
    
   
   
   
   
    
        
     
     case reducePhase:
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
             ntasks = nReduce
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
             n_other = 
     
     len(mapFiles)
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
         }
    
    
   
   

   
   
    
    
   
   
   
   
    
     
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
         fmt.Printf(
     
     "Schedule: %v %v tasks (%d I/Os)\n", ntasks, phase, n_other)
    
    
   
   

   
   
    
    
   
   
   
   
    
     
    
    
   
   

   
   
    
    
   
   
   
   
    
        
     
     var wg sync.WaitGroup
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
         taskChan := 
     
     make(
     
     chan 
     
     int)
    
    
   
   

   
   
    
    
   
   
   
   
    
        
     
     go 
     
     func() {
    
    
   
   

   
   
    
    
   
   
   
   
    
            
     
     for t := 
     
     0; t < ntasks; t++ {
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
                 wg.Add(
     
     1)
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
                 taskChan <- t
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
             }
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
             wg.Wait()  
     
     // 直至ntasks个任务全完成
    
    
   
   

   
   
    
    
   
   
   
   
    
            
     
     close(taskChan)
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
         }()
    
    
   
   

   
   
    
    
   
   
   
   
    
     
    
    
   
   

   
   
    
    
   
   
   
   
    
        
     
     for task := 
     
     range taskChan {
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
             addr := <-registerChan  
     
     // 取一空闲worker
    
    
   
   

   
   
    
    
   
   
   
   
    
            
     
     go 
     
     func(j int, addr string) {  
     
     // 通过RPC调用执行任务
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
                 result := call(addr, 
     
     "Worker.DoTask", DoTaskArgs{jobName, mapFiles[j], phase, j, n_other}, 
     
     nil)
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
                 fmt.Println(
     
     "46 current: ", j)
    
    
   
   

   
   
    
    
   
   
   
   
    
                
     
     if result {
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
                     wg.Done()
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
                 } 
     
     else {
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
                     taskChan <- j  
     
     // 该任务失败，需再次放入taskChan以便重新执行，直至成功
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
                 }
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
                 registerChan <- addr  
     
     // 执行完任务的worker放回registerChan，以继续执行其他任务 
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
             }(task, addr)
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
         }
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
         wg.Wait()
    
    
   
   

   
   
    
    
   
   
   
   
    
     
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
         fmt.Printf(
     
     "Schedule: %v done\n", phase)
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
     }

重点：使用goroutine+channel+waitgroup进行协程同步和通信

goroutine: golang的性能保障，用户态的调度粒度，每个goroutine在用户内存中有自己的栈空间。
channel: 传递数据的管道，常用于存储各种中间结果，在复杂的逻辑处理中实用得不得了。
sync.WaitGroup: 等待一组协程的完成。其内部是一个计数器：计数大于0时，Wait()会阻塞调用它的goroutine；计数归零后，Wait()返回。提供以下三个操作:
- Add(n): 计数器加n，即需要等待的goroutine增加n个（常用Add(1)）
- Done(): 有一个任务完成，计数器减一
- Wait(): 阻塞，直至计数器归零（所有任务完成）

Enzo邵靳天

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
6.824 Lab1 MapReduce解析与实现

MapReduce, 批处理的典型之一。主要思想即“分而治之”，将一大批数据（一个大任务）分成多个子任务，分别进行运算（同时）（map），再将运算结果合起来（reduce）master: 负责任务调度mapper: 执行各个子任务，map运算reducer: 执行结果汇总，reduce运算例：在K/V的wordCount中，源数据为一个大文件，每个mapper负责一部分文件的count，m...
复制链接

扫一扫

专栏目录