以最简单的统计词频为例，我们只需要简单地写两个函数，就可以搭建起一个简单的服务集群：
(1) Map和Reduce 函数
(2) MapReduceSpecification 对象的设置（貌似有专门针对 C++ 的函数库）
下面的这个链接是对谷歌《MapReduce: Simplified Data Processing on Large Clusters》论文的翻译
【2】MapReduce超大集群的简单数据处理
Java环境下对MapReduce的设置
【3】http://blog.csdn.net/xiaotom5/article/details/8074791
下面是统计词频的源代码
1 #include "mapreduce/mapreduce.h"
2
3 // User's map function
4 class WordCounter : public Mapper {
5 public:
6 virtual void Map(const MapInput& input) {
7 const string& text = input.value();
8 const int n = text.size();
9 for (int i = 0; i < n; ) {
10 // Skip past leading whitespace
11 while ((i < n) && isspace(text))
12 i++;
13
14 // Find word end
15 int start = i;
16 while ((i < n) && !isspace(text))
17 i++;
18 if (start < i)
19 Emit(text.substr(start,i-start),"1");
20 }
21 }
22 };
23
24 REGISTER_MAPPER(WordCounter);
25
26 // User's reduce function
27 class Adder : public Reducer {
28 virtual void Reduce(ReduceInput* input) {
29 // Iterate over all entries with the
30 // same key and add the values
31 int64 value = 0;
32 while (!input->done()) {
33 value += StringToInt(input->value());
34 input->NextValue();
35 }
36
37 // Emit sum for input->key()
38 Emit(IntToString(value));
39 }
40 };
41
42 REGISTER_REDUCER(Adder);
43
44 int main(int argc, char** argv) {
45 ParseCommandLineFlags(argc, argv);
46
47 MapReduceSpecification spec;
48
49 // Store list of input files into "spec"
50 for (int i = 1; i < argc; i++) {
51 MapReduceInput* input = spec.add_input();
52 input->set_format("text");
53 input->set_filepattern(argv);
54 input->set_mapper_class("WordCounter");
55 }
56
57 // Specify the output files:
58 // /gfs/test/freq-00000-of-00100
59 // /gfs/test/freq-00001-of-00100
60 //
61 MapReduceOutput* out = spec.output();
62 out->set_filebase("/gfs/test/freq");
63 out->set_num_tasks(100);
64 out->set_format("text");
65 out->set_reducer_class("Adder");
66
67 // Optional: do partial sums within map
68 // tasks to save network bandwidth
69 out->set_combiner_class("Adder");
70
71 // Tuning parameters: use at most 2000
72 // machines and 100 MB of memory per task
73 spec.set_machines(2000);
74 spec.set_map_megabytes(100);
75 spec.set_reduce_megabytes(100);
76
77 // Now run it
78 MapReduceResult result;
79 if (!MapReduce(spec, &result)) abort();
80
81 // Done: 'result' structure contains info
82 // about counters, time taken, number of
83 // machines used, etc.
84 return 0;
85 }
86