Lab详情页:http://nil.csail.mit.edu/6.824/2022/labs/lab-mr.html
本文根据上面详情页中的要求和引导,简要完成了一个分布式MapReduce的实现
代码仓库:Pokhanov/MIT6.824 Distributed System Lab - 码云 - 开源中国 (gitee.com)
前置知识
MapReduce模型 论文摘要-MapReduce: Simplified Data Processing on Large Clusters-CSDN博客
目标
实现分布式MapReduce。这个系统由两个部分组成:
- coordinator程序(对应论文摘要中的主副本)
- worker程序(对应论文摘要中的工人副本)
coordinator负责分配任务给workers,以及在有worker发生故障时进行调度处理,确保整个任务能够完成。
worker负责执行具体任务,会涉及读取输入、处理数据(包括调用用户提供的Map、Reduce函数)、将输出写入特定文件。
实际运行时,会启动一个coordinator进程和多个worker进程(并行)。每个worker进程都会通过RPC(远程过程调用)和coordinator交互,以此来获取任务/返回执行结果。
现实中真正的MapReduce系统,不同的worker进程会运行在不同的机器上,但在这个实验中所有进程都运行在一台机器上。
实现
1.mrcoordinator.go
coordinator程序入口
package main
//
// start the coordinator process, which is implemented
// in ../mr/coordinator.go
//
// go run mrcoordinator.go pg*.txt
//
// Please do not change this file.
//
import (
"fmt"
"os"
"time"
"6.824/mr"
)
// main starts a coordinator for the input files named on the command line,
// using 10 reduce tasks. It polls Done once per second until the job is
// reported complete, waits one more second, and then prints a final message.
func main() {
	if len(os.Args) <= 1 {
		fmt.Fprintf(os.Stderr, "Usage: mrcoordinator inputfiles...\n")
		os.Exit(1)
	}

	coordinator := mr.MakeCoordinator(os.Args[1:], 10)
	for !coordinator.Done() {
		time.Sleep(time.Second)
	}

	time.Sleep(time.Second)
	fmt.Printf("调度进程结束 \n")
}
2.coordinator.go
coordinator的具体实现。主要作用是管理任务状态、调度任务,并对每个处理中任务的执行时间进行计时:一旦超过设定值(这里实现为10s),就将该任务重新分配。
package mr
import (
"errors"
"fmt"
"log"
"net"
"net/http"
"net/rpc"
"os"
"sync"
"time"
)
// Coordinator holds the scheduling state shared by the RPC handlers, Done,
// and the timeout goroutine started by MakeCoordinator.
type Coordinator struct {
processingTasks map[int]*TaskCor // tasks currently assigned to a worker, keyed by task id
unstartTasks map[int]*TaskCor // tasks waiting to be assigned, keyed by task id
finishedTasks map[int]*TaskCor // tasks whose result has been accepted, keyed by task id
mu sync.Mutex // held by the RPC handlers, Done and TimeTaskExcution while they touch the fields above and below
mu2 sync.Mutex // taken only inside reAssignTask (note: sync.Mutex is not reentrant)
intermediateFiles [][]string // intermediate file names, one list per reduce bucket
nReduce int // number of reduce tasks, passed in by the caller of MakeCoordinator
outputFiles []string // output file names reported by finished reduce tasks
ret bool // true once the whole job is complete
crashedWorkers []int // uids of workers whose task timed out
}
// TaskCor is the coordinator-side record of a single map or reduce task.
type TaskCor struct {
Id int // task id
ExcutedTime int // number of timer ticks (one per second) the task has spent in the processing set
Category string // task kind: "Map" or "Reduce"
Split []string // input file names for the task
ResiponsibleWorkerUid int // uid of the worker the task is currently assigned to
mu sync.Mutex // taken by TimeTaskExcution and reAssignTask around their updates to this record
}
// inputFileNum is the number of input files; it is set by MakeCoordinator and
// compared against each reduce bucket's intermediate-file count in MapTaskFinish.
var inputFileNum int
// Your code here -- RPC handlers for the worker to call.
// AssignTask is the RPC handler behind a worker's request for a new task.
//
// It returns an error (and assigns nothing) when there is no unstarted task or
// when the requesting worker has been recorded as crashed. Otherwise it copies
// one unstarted task into reply together with nReduce, records the worker as
// responsible for it, and moves the task from the unstarted set to the
// processing set.
func (c *Coordinator) AssignTask(args *AskForNewTaskArgs, reply *AskForNewTaskReply) error {
	c.mu.Lock()
	defer c.mu.Unlock()

	if len(c.unstartTasks) == 0 {
		return errors.New("无未开始任务,不分配任务 ")
	}
	if contains(&c.crashedWorkers, args.WorkerUid) != -1 {
		return errors.New("当前请求的woker已被定义为crashed,不分配任务 ")
	}

	// Only the first entry produced by the map iteration is used; Go does not
	// specify map iteration order, so which pending task is chosen is arbitrary.
	for id, pending := range c.unstartTasks {
		fmt.Printf("分配任务 %v \n", pending.Id)

		reply.Task = &Task{
			Id:       pending.Id,
			Category: pending.Category,
			Split:    pending.Split,
		}
		reply.NReduce = c.nReduce
		pending.ResiponsibleWorkerUid = args.WorkerUid

		// Move the task: unstarted -> processing.
		delete(c.unstartTasks, id)
		c.processingTasks[id] = pending
		break
	}
	return nil
}
// MapTaskFinish is the RPC handler a worker calls after completing a map task.
//
// The report is ignored (nil is still returned) when the task is not in the
// processing set or when the reporting worker is not the one recorded as
// responsible for it. Otherwise the reported intermediate files are appended
// to their reduce buckets and the task moves to the finished set. Finally,
// every bucket that holds exactly inputFileNum files gets a Reduce task placed
// in the unstarted set.
func (c *Coordinator) MapTaskFinish(args *MapTaskFinishArgs, reply *MapTaskFinishReply) error {
	c.mu.Lock()
	defer c.mu.Unlock()

	id := args.Task.Id
	finished := c.processingTasks[id]
	switch {
	case finished == nil:
		fmt.Printf("MAP-完成任务不在processing列表中,不保存结果 \n")
		return nil
	case args.WorkerUid != finished.ResiponsibleWorkerUid:
		fmt.Printf("MAP-完成woker与任务负责woker uid不符,重新执行 \n")
		return nil
	}

	// Record the reported intermediate files bucket by bucket.
	for bucket, names := range args.FinishedMidFiles {
		for _, name := range names {
			c.intermediateFiles[bucket] = append(c.intermediateFiles[bucket], name)
		}
	}

	// Move the task: processing -> finished.
	delete(c.processingTasks, id)
	c.finishedTasks[id] = finished

	// A bucket is ready for reducing once it has one file per input file.
	for bucket, files := range c.intermediateFiles {
		if len(files) != inputFileNum {
			continue
		}
		c.unstartTasks[bucket] = &TaskCor{
			Id:       bucket,
			Category: "Reduce",
			Split:    files,
		}
	}
	return nil
}
// ReduceTaskFinish is the RPC handler a worker calls after completing a reduce
// task.
//
// The report is ignored (nil is still returned) when the task is not in the
// processing set or when the reporting worker is not the one recorded as
// responsible for it. Otherwise the output file name is recorded, the task
// moves to the finished set, and the job is marked complete once the number of
// recorded output files equals nReduce.
func (c *Coordinator) ReduceTaskFinish(args *ReduceTaskFinishArgs, reply *ReduceTaskFinishReply) error {
	c.mu.Lock()
	defer c.mu.Unlock()

	id := args.Task.Id
	finished := c.processingTasks[id]
	switch {
	case finished == nil:
		fmt.Printf("Reduce-完成任务不在processing列表中,不保存结果 \n")
		return nil
	case args.WorkerUid != finished.ResiponsibleWorkerUid:
		fmt.Printf("Reduce-完成woker与任务负责woker uid不符,重新执行 \n")
		return nil
	}

	// Remember the output file reported by the worker.
	c.outputFiles = append(c.outputFiles, args.OutPutFileName)

	// Move the task: processing -> finished.
	delete(c.processingTasks, id)
	c.finishedTasks[id] = finished

	// The job is done when every reduce task has reported an output file.
	if len(c.outputFiles) == c.nReduce {
		c.ret = true
	}
	return nil
}
// server registers the coordinator with net/rpc, removes any existing file at
// the coordinator socket path, listens on that UNIX-domain socket, and serves
// HTTP-based RPC on it in a background goroutine. A listen failure terminates
// the process.
func (c *Coordinator) server() {
	rpc.Register(c)
	rpc.HandleHTTP()

	//l, e := net.Listen("tcp", ":1234")
	socketPath := coordinatorSock()
	os.Remove(socketPath)

	listener, err := net.Listen("unix", socketPath)
	if err != nil {
		log.Fatal("listen error:", err)
	}
	go http.Serve(listener, nil)
}
// Done reports whether the entire job has finished. main/mrcoordinator.go
// calls it periodically; the flag is read while holding the coordinator lock.
func (c *Coordinator) Done() bool {
	c.mu.Lock()
	defer c.mu.Unlock()
	return c.ret
}
// MakeCoordinator creates a Coordinator; main/mrcoordinator.go calls it.
//
// files are the input files (one Map task is created per file) and nReduce is
// the number of reduce tasks to use. Map task ids start at nReduce+1. After
// the initial state is built, the RPC server and the task-timeout goroutine
// are started and the coordinator is returned.
func MakeCoordinator(files []string, nReduce int) *Coordinator {
	c := &Coordinator{
		nReduce:         nReduce,
		unstartTasks:    make(map[int]*TaskCor),
		processingTasks: make(map[int]*TaskCor),
		finishedTasks:   make(map[int]*TaskCor),
		ret:             false,
	}
	inputFileNum = len(files)

	// One (initially empty) intermediate-file list per reduce bucket.
	for bucket := 0; bucket < nReduce; bucket++ {
		c.intermediateFiles = append(c.intermediateFiles, nil)
	}

	// One Map task per input file.
	for i, name := range files {
		mapTask := &TaskCor{
			Id:       nReduce + 1 + i,
			Category: "Map",
			Split:    []string{name},
		}
		c.mu.Lock()
		c.unstartTasks[mapTask.Id] = mapTask
		c.mu.Unlock()
	}

	c.server()             // start listening for RPCs
	go TimeTaskExcution(c) // start the task-timeout goroutine
	return c
}
// TimeTaskExcution loops forever, once per second adding one tick to every
// task in the processing set. A task that has accumulated 10 or more ticks is
// logged as timed out and handed to reAssignTask. The coordinator lock is held
// for the duration of each pass and released while sleeping.
func TimeTaskExcution(c *Coordinator) {
	for {
		c.mu.Lock()
		for _, running := range c.processingTasks {
			running.mu.Lock()
			running.ExcutedTime++
			running.mu.Unlock()

			// Still within the 10-tick budget: nothing more to do for this task.
			if running.ExcutedTime < 10 {
				continue
			}

			fmt.Printf("待开始任务列表: %v \n", c.unstartTasks)
			fmt.Printf("处理中任务列表: %v \n", c.processingTasks)
			fmt.Printf("已完成任务列表: %v \n", c.finishedTasks)
			fmt.Printf("任务 %v 超时,待重新分配 \n", running.Id)
			fmt.Printf("重分配前处理中任务列表: %v \n", c.processingTasks)
			reAssignTask(c, running) // put the task back into the unstarted set
			fmt.Printf("重分配后处理中任务列表: %v \n", c.processingTasks)
		}
		c.mu.Unlock()

		time.Sleep(time.Second) // one tick per second
	}
}
// reAssignTask returns a timed-out task to the unstarted set. The worker that
// was responsible for it is appended to crashedWorkers, the task is moved from
// the processing set to the unstarted set, and its elapsed time and
// responsible-worker uid are reset to zero.
func reAssignTask(c *Coordinator, taskCor *TaskCor) {
	c.mu2.Lock()
	defer c.mu2.Unlock()

	// Remember the previous worker as crashed.
	c.crashedWorkers = append(c.crashedWorkers, taskCor.ResiponsibleWorkerUid)

	// Move the task: processing -> unstarted.
	id := taskCor.Id
	delete(c.processingTasks, id)
	c.unstartTasks[id] = taskCor

	// Clear the per-assignment bookkeeping.
	taskCor.mu.Lock()
	taskCor.ExcutedTime, taskCor.ResiponsibleWorkerUid = 0, 0
	taskCor.mu.Unlock()
}
// contains returns the index of the first element of *list equal to aim, or
// -1 when no element matches.
func contains(list *[]int, aim int) int {
	items := *list
	for i := 0; i < len(items); i++ {
		if items[i] == aim {
			return i
		}
	}
	return -1
}
3.mrworker.go
worker程序入口
package main
//
// start a worker process, which is implemented
// in ../mr/worker.go. typically there will be
// multiple worker processes, talking to one coordinator.
//
// go run mrworker.go wc.so
//
// Please do not change this file.
//
import (
"fmt"
"log"
"os"
"plugin"
"6.824/mr"
)
// main expects exactly one argument, the path of a plugin file. It loads the
// Map and Reduce functions from that plugin and runs the worker loop with them.
func main() {
	if argc := len(os.Args); argc != 2 {
		fmt.Fprintf(os.Stderr, "Usage: mrworker xxx.so\n")
		os.Exit(1)
	}

	pluginPath := os.Args[1]
	mapf, reducef := loadPlugin(pluginPath)
	mr.Worker(mapf, reducef)
}
// loadPlugin opens the plugin file at filename (e.g. ../mrapps/wc.so) and
// returns its exported Map and Reduce functions. The process is terminated if
// the plugin cannot be opened or either symbol is missing; a symbol with an
// unexpected type makes the type assertion panic.
func loadPlugin(filename string) (func(string, string) []mr.KeyValue, func(string, []string) string) {
	plug, err := plugin.Open(filename)
	if err != nil {
		log.Fatalf("cannot load plugin %v", err.Error())
	}

	mapSym, err := plug.Lookup("Map")
	if err != nil {
		log.Fatalf("cannot find Map in %v", filename)
	}
	mapFn := mapSym.(func(string, string) []mr.KeyValue)

	reduceSym, err := plug.Lookup("Reduce")
	if err != nil {
		log.Fatalf("cannot find Reduce in %v", filename)
	}
	reduceFn := reduceSym.(func(string, []string) string)

	return mapFn, reduceFn
}
4.worker.go
worker的具体实现。主要作用是执行Map/Reduce任务,包括读取输入、调用用户提供的mapf/reducef函数、将输出写入文件。
package mr
import (
"encoding/json"
"fmt"
"hash/fnv"
"io/ioutil"
"log"
"net/rpc"
"os"
"sort"
"strconv"
"strings"
"time"
)
// ByKey implements sort.Interface over a slice of KeyValue, ordering elements
// by ascending Key.
type ByKey []KeyValue

// Len returns the number of elements.
func (s ByKey) Len() int {
	return len(s)
}

// Swap exchanges the elements at positions i and j.
func (s ByKey) Swap(i, j int) {
	tmp := s[i]
	s[i] = s[j]
	s[j] = tmp
}

// Less reports whether the key at position i sorts before the key at j.
func (s ByKey) Less(i, j int) bool {
	return s[i].Key < s[j].Key
}

// KeyValue is a single key/value pair; Map functions return a slice of them.
type KeyValue struct {
	Key   string
	Value string
}
// ihash returns a non-negative hash of key: the 32-bit FNV-1a sum with its top
// bit cleared. Use ihash(key) % NReduce to choose the reduce task number for
// each KeyValue emitted by Map.
func ihash(key string) int {
	hasher := fnv.New32a()
	hasher.Write([]byte(key))
	sum := hasher.Sum32()
	return int(sum &^ (1 << 31))
}
// Task is the task description the coordinator hands to a worker in an
// AskForNewTaskReply; workers send it back in their finish reports.
type Task struct {
Id int // task id
Category string // task kind: "Map" or "Reduce"
Split []string // input file names for the task
}
// waitingTime counts the consecutive one-second waits in Worker during which
// no task was obtained; Worker resets it to 0 whenever a task arrives.
var waitingTime int
// uid identifies this worker process in RPCs to the coordinator; Worker sets
// it from the current time in nanoseconds.
var uid int
// Worker is the worker main loop; main/mrworker.go calls it.
//
// It derives a uid from the current time, then repeatedly asks the coordinator
// for a task. A "Map" task is run with mapf and a "Reduce" task with reducef.
// When no task is obtained it sleeps for one second; after 30 consecutive
// such waits the function returns.
func Worker(mapf func(string, string) []KeyValue,
	reducef func(string, []string) string) {

	uid = int(time.Now().UnixNano())
	fmt.Printf("开始进程 %v \n", uid)

	for {
		reply := AskForNewTask()
		if reply != nil && reply.Task != nil {
			// Got work: reset the idle counter and dispatch on the task kind.
			waitingTime = 0
			switch reply.Task.Category {
			case "Map":
				Map(mapf, reply)
			case "Reduce":
				Reduce(reducef, reply)
			}
			continue
		}

		// No task this round: wait, and give up after 30 idle rounds in a row.
		time.Sleep(time.Second)
		waitingTime++
		if waitingTime >= 30 {
			fmt.Printf("进程 %v 退出 \n", uid)
			return
		}
	}
}
// Map executes one map task described by reply.
//
// For every input file in the task's split it reads the file, applies the
// user-supplied mapf, sorts the resulting pairs by key, and appends each pair
// as a JSON value to the temporary intermediate file of its reduce bucket
// (ihash(key) % reply.NReduce). Once everything is written, each temporary
// file is closed and renamed to its final name, and the final names are
// reported to the coordinator through the Coordinator.MapTaskFinish RPC.
//
// Any I/O failure terminates the worker process with log.Fatalf, so that a
// task is never reported as finished with incomplete output.
func Map(mapf func(string, string) []KeyValue, reply *AskForNewTaskReply) {
	curTask := reply.Task
	nReduce := reply.NReduce

	var midFileMap map[int]*os.File          // bucket -> temporary intermediate file
	encoders := make(map[int]*json.Encoder) // bucket -> encoder writing to that file
	for _, filename := range curTask.Split {
		// Read the input.
		// NOTE(review): inputs are resolved against a hard-coded absolute
		// directory; this only works on a machine with that layout. Left
		// unchanged because the coordinator's file names may depend on it.
		filePath := "/root/go/src/6.824/src/main/" + filename
		file, err := os.Open(filePath)
		if err != nil {
			log.Fatalf("cannot open %v", err.Error())
		}
		content, err := ioutil.ReadAll(file)
		file.Close()
		if err != nil {
			log.Fatalf("cannot read %v", filename)
		}

		// Run the user-supplied map function and sort its output by key.
		kva := mapf(filename, string(content))
		sort.Sort(ByKey(kva))

		// Create the temporary files once per task. Re-creating them for every
		// input file would truncate what earlier inputs already wrote.
		if midFileMap == nil {
			midFileMap = ConstructMidFileMap(nReduce, curTask.Id)
		}
		for _, kv := range kva {
			// The bucket must be computed with the task's NReduce, not a constant.
			bucket := ihash(kv.Key) % nReduce
			enc, ok := encoders[bucket]
			if !ok {
				out := midFileMap[bucket]
				if out == nil {
					log.Fatalf("no intermediate file for bucket %v of task %v", bucket, curTask.Id)
				}
				enc = json.NewEncoder(out)
				encoders[bucket] = enc
			}
			if err := enc.Encode(&kv); err != nil {
				log.Fatalf("中间键值对写入过程出错%v", err)
			}
		}
	}

	// To keep a Reduce task from reading output of a Map task that is still
	// running, the temporary files are renamed to their final names only after
	// all content has been written and the files have been closed.
	var finishedMidFiles [][]string
	for i := 0; i < nReduce; i++ {
		finishedMidFiles = append(finishedMidFiles, nil)
	}
	for k, v := range midFileMap {
		if v == nil {
			log.Fatalf("no intermediate file for bucket %v of task %v", k, curTask.Id)
		}
		tmpName := v.Name()
		if err := v.Close(); err != nil {
			log.Fatalf("cannot close %v: %v", tmpName, err)
		}
		// Temporary names are "mr-<task>-<bucket>-tmp"; the final name drops the suffix.
		finalName := strings.TrimSuffix(tmpName, "-tmp")
		if err := os.Rename(tmpName, finalName); err != nil {
			log.Fatalf("cannot rename %v to %v: %v", tmpName, finalName, err)
		}
		finishedMidFiles[k] = append(finishedMidFiles[k], finalName)
	}

	// Report the list of output files back to the coordinator.
	args := MapTaskFinishArgs{
		Task:             curTask,
		FinishedMidFiles: finishedMidFiles,
		WorkerUid:        uid,
	}
	finishReply := MapTaskFinishReply{}
	if ok := call("Coordinator.MapTaskFinish", &args, &finishReply); ok {
		fmt.Printf("task %v 已完成\n", curTask.Id)
	} else {
		fmt.Printf("call failed!\n")
	}
}
// ConstructMidFileMap creates the temporary intermediate files of a map task
// and returns them keyed by reduce bucket (0 .. nReduce-1). The file for
// bucket i is created in the current directory as "mr-<taskId>-<i>-tmp"; an
// existing file of that name is truncated. The process is terminated if a file
// cannot be created, so the returned map never contains a nil file.
func ConstructMidFileMap(nReduce int, taskId int) map[int]*os.File {
	midFileMap := make(map[int]*os.File)
	for i := 0; i < nReduce; i++ {
		tmpFileName := "mr-" + strconv.Itoa(taskId) + "-" + strconv.Itoa(i) + "-tmp"
		ofile, err := os.Create(tmpFileName)
		if err != nil {
			log.Fatalf("cannot create %v: %v", tmpFileName, err)
		}
		midFileMap[i] = ofile
	}
	return midFileMap
}
// Reduce executes one reduce task described by reply.
//
// It loads and sorts all intermediate pairs of the task's split, groups
// consecutive pairs with equal keys, calls the user-supplied reducef once per
// key, and writes one "key output" line per key to "mr-out-<task id>". The
// output file is closed before its name is reported to the coordinator through
// the Coordinator.ReduceTaskFinish RPC.
//
// Failure to create, write or close the output file terminates the worker
// process with log.Fatalf, so that a task is never reported as finished with
// incomplete output.
func Reduce(reducef func(string, []string) string, reply *AskForNewTaskReply) {
	curTask := reply.Task
	intermediate := GetIntermediate(curTask.Split) // read the intermediate pairs

	// Create the output file.
	outPutFileName := "mr-out-" + strconv.Itoa(curTask.Id)
	ofile, err := os.Create(outPutFileName)
	if err != nil {
		log.Fatalf("cannot create %v: %v", outPutFileName, err)
	}

	for i := 0; i < len(intermediate); {
		// Find the end of the run of pairs that share intermediate[i].Key.
		j := i + 1
		for j < len(intermediate) && intermediate[j].Key == intermediate[i].Key {
			j++
		}
		values := make([]string, 0, j-i)
		for k := i; k < j; k++ {
			values = append(values, intermediate[k].Value)
		}

		// Apply the user-supplied reduce function and write its result.
		output := reducef(intermediate[i].Key, values)
		if _, err := fmt.Fprintf(ofile, "%v %v\n", intermediate[i].Key, output); err != nil {
			log.Fatalf("cannot write %v: %v", outPutFileName, err)
		}
		i = j
	}
	if err := ofile.Close(); err != nil {
		log.Fatalf("cannot close %v: %v", outPutFileName, err)
	}

	// Report the output file name back to the coordinator.
	args := ReduceTaskFinishArgs{
		Task:           curTask,
		OutPutFileName: outPutFileName,
		WorkerUid:      uid,
	}
	finishReply := ReduceTaskFinishReply{}
	if ok := call("Coordinator.ReduceTaskFinish", &args, &finishReply); ok {
		fmt.Printf("输出文件 %v 已完成\n", outPutFileName)
	} else {
		fmt.Printf("call failed!\n")
	}
}
// GetIntermediate reads every intermediate key/value pair a reduce task is
// responsible for. split lists the intermediate files; each is expected to
// hold a stream of JSON-encoded KeyValue values. The pairs of all files are
// returned in one slice sorted by key. A file that cannot be opened or that
// contains undecodable data terminates the process.
func GetIntermediate(split []string) []KeyValue {
	var intermediate []KeyValue
	for _, name := range split {
		intermediate = append(intermediate, readIntermediateFile(name)...)
	}
	sort.Sort(ByKey(intermediate))
	return intermediate
}

// readIntermediateFile decodes all KeyValue values stored in the file name and
// closes the file before returning.
func readIntermediateFile(name string) []KeyValue {
	file, err := os.Open(name)
	if err != nil {
		log.Fatalf("cannot open %v", name)
	}
	defer file.Close()

	var kvs []KeyValue
	dec := json.NewDecoder(file)
	// More reports false at the end of the stream, so an error from Decode
	// inside the loop means malformed or truncated data rather than plain EOF.
	for dec.More() {
		var kv KeyValue
		if err := dec.Decode(&kv); err != nil {
			log.Fatalf("cannot decode %v: %v", name, err)
		}
		kvs = append(kvs, kv)
	}
	return kvs
}
// AskForNewTask asks the coordinator for a new task via the
// Coordinator.AssignTask RPC, identifying this worker by uid. It returns the
// coordinator's reply when the call succeeds and nil when it fails.
func AskForNewTask() *AskForNewTaskReply {
	args := AskForNewTaskArgs{WorkerUid: uid}
	reply := AskForNewTaskReply{Task: nil}

	if !call("Coordinator.AssignTask", &args, &reply) {
		fmt.Printf("%v 请求任务 未被分配 \n", uid)
		return nil
	}

	fmt.Printf("%v 请求任务 获得任务 %v \n", uid, reply.Task.Id)
	return &reply
}
// call sends an RPC request to the coordinator over its UNIX-domain socket and
// waits for the response. It usually returns true; it prints the error and
// returns false if the call itself fails. Failure to dial the coordinator
// terminates the process.
func call(rpcname string, args interface{}, reply interface{}) bool {
	// c, err := rpc.DialHTTP("tcp", "127.0.0.1"+":1234")
	client, err := rpc.DialHTTP("unix", coordinatorSock())
	if err != nil {
		log.Fatal("dialing:", err)
	}
	defer client.Close()

	if err := client.Call(rpcname, args, reply); err != nil {
		fmt.Println(err)
		return false
	}
	return true
}
5.rpc.go
RPC(远程过程调用)用到的参数、返回值的结构定义。
package mr
//
// RPC definitions.
//
// remember to capitalize all names.
//
import (
"os"
"strconv"
)
// Add your RPC definitions here.
// AskForNewTaskArgs is the argument of the Coordinator.AssignTask RPC.
type AskForNewTaskArgs struct {
WorkerUid int // uid of the worker requesting a task
}
// AskForNewTaskReply is the reply of the Coordinator.AssignTask RPC.
type AskForNewTaskReply struct {
Task *Task // the assigned task
NReduce int // number of reduce tasks configured on the coordinator
}
// MapTaskFinishArgs is the argument of the Coordinator.MapTaskFinish RPC.
type MapTaskFinishArgs struct {
Task *Task // the map task that was executed
FinishedMidFiles [][]string // intermediate file names, indexed by reduce bucket
WorkerUid int // uid of the reporting worker
}
// MapTaskFinishReply is the (empty) reply of the Coordinator.MapTaskFinish RPC.
type MapTaskFinishReply struct {
}
// ReduceTaskFinishArgs is the argument of the Coordinator.ReduceTaskFinish RPC.
type ReduceTaskFinishArgs struct {
Task *Task // the reduce task that was executed
OutPutFileName string // name of the output file the worker wrote
WorkerUid int // uid of the reporting worker
}
// ReduceTaskFinishReply is the (empty) reply of the Coordinator.ReduceTaskFinish RPC.
type ReduceTaskFinishReply struct {
}
// coordinatorSock returns a unique-ish UNIX-domain socket name for the
// coordinator: "/var/tmp/824-mr-" followed by the caller's numeric user id.
// The current directory can't be used since Athena AFS doesn't support
// UNIX-domain sockets.
func coordinatorSock() string {
	return "/var/tmp/824-mr-" + strconv.Itoa(os.Getuid())
}
结果
Lab提供了一个测试脚本,会进行以下测试:
- 针对word-count和indexer两个MapReduce任务,该系统的输出结果与顺序执行的输出结果是否相同。
- 该系统的worker进程是否实现了并行。
- Map/Reduce任务执行次数是否正常。
- 是否有进程在MapReduce任务完成以前退出。
- 是否能在部分worker进程崩溃的情况下完成MapReduce任务。
上面的代码通过了所有测试。