linux 命令行如何模拟map-reduce的实现过程

最新推荐文章于 2022-04-06 11:11:32 发布

timegoesby_001

最新推荐文章于 2022-04-06 11:11:32 发布

阅读量1.4k

点赞数 1

分类专栏：搜索技术大数据处理

本文链接：https://blog.csdn.net/cloudusers/article/details/15811363

版权

搜索技术同时被 2 个专栏收录

26 篇文章 0 订阅

订阅专栏

大数据处理

18 篇文章 0 订阅

订阅专栏

提起map-reduce ，典型的示例就是词频统计，例如统计词典中单词出现的次数等

为什么总用这个示例来说明呢？其实 map-reduce 就是一个 shell 命令的串联过程，只不过是用集群中的多台机器来处理。

例如有测试数据文件 test.data，以及 map、reduce 两个程序文件，

则map-reduce可以本地概括为： cat test.data | ./map | sort -k1 | ./reduce > res.data

在这条用管道串起来的命令行中，map 的输出和 reduce 的输入之间之所以要按第一列排序，是因为这正是 map-reduce 两个阶段之间的关系：map 的输出作为 reduce 的输入，而这中间隐含了一个按第一列（key）排序的过程。

所以总是用词频统计来介绍 map-reduce 的使用。另外，在本地写完 map-reduce 程序后，也可以先用上面的命令行在本地测一下，结果符合预期的话，就可以提交到 hadoop 上进行任务处理。

// res.data

a|1   2
a|3   2
b|2   1
b|4   1
c|2   1
d|1   1
e|5   2

// test.data

a   1
b   2
a   3
b   4
a   1
c   2
a   3
d   1
e   5
e   5

/// map.cpp ///

#include <iostream>

#include <vector>
#include <boost/regex.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string/classification.hpp>

using namespace std;
using namespace boost;

// Token buffer shared by every call to process(): it receives the fields
// that boost::split produces from the current input line.
vector<string> svec;

/*-----------------------------------------------------------
 Mapper step for a single input line.

 Splits `str` on tab characters (adjacent tabs are compressed into
 one delimiter) into the file-level buffer `svec`. When exactly two
 fields result, both are trimmed and written to stdout as one
 "first|second" line. Any other field count produces no output.
------------------------------------------------------------*/
void process(std::string str) {
        boost::split( svec, str, boost::is_any_of( "\t" ), boost::token_compress_on );

        // Only well-formed two-column records are emitted.
        if (svec.size() != 2) {
                return;
        }

        std::string& first_field = svec[0];
        std::string& second_field = svec[1];
        boost::trim(first_field);
        boost::trim(second_field);
        printf("%s|%s\n", first_field.c_str(), second_field.c_str());
}

/*-----------------------------------------------------------
 Mapper entry point.

 Reads stdin in chunks of at most BUF_SIZE - 1 characters via
 fgets and hands every chunk that matches the ".+" pattern to
 process(). Returns 0 once fgets reports end of input.

 argc / argv are unused.
------------------------------------------------------------*/
int main(int argc, char** argv) {
        const int BUF_SIZE = 1024;
        char buf[BUF_SIZE] = {0};
        boost::regex reg(".+");

        // Loop on the fgets result rather than on feof(): fgets returns NULL
        // both at end-of-file and on a read error, and feof() stays false in
        // the error case, so a feof()-driven loop would never terminate.
        while (NULL != fgets(buf, BUF_SIZE, stdin)) {
                if (boost::regex_match(buf, reg)) {
                        process(buf);
                }
        }

        return 0;
}

/// reduce.cpp ///

#include <iostream>
#include <map>
#include <boost/algorithm/string.hpp>
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string/classification.hpp>

using namespace std;
using namespace boost;

// Key -> occurrence count, plus the matching iterator type.
typedef std::map<std::string, int> smap;
typedef smap::iterator mapiter;

// Counts accumulated for the group currently being reduced; main() fills
// this map and process() drains it.
smap keyval;
// Shared cursor over keyval, reused by process().
mapiter mit;

/*-----------------------------------------------------------
 Flush the accumulated counts.

 Writes every entry of `keyval` to stdout as "<key>\t<count>",
 one entry per line in the map's key order, then empties the
 map so the next group starts from scratch. An empty map
 produces no output.
------------------------------------------------------------*/
void process() {
    mit = keyval.begin();
    while (keyval.end() != mit) {
        std::cout << mit->first << "\t" << mit->second << std::endl;
        ++mit;
    }

    keyval.clear();
}

/*-----------------------------------------------------------
------------------------------------------------------------*/
int main(int argc, char** argv) {
   const int BUF_SIZE = 1024;
   char buf[BUF_SIZE];
   string key, last_str;
   const int default_val = 1;

   while(!feof(stdin)) {
       if(NULL == fgets(buf, BUF_SIZE, stdin)){
           continue;
       }

       key = buf;
                trim(key);
       if( key == last_str ) {//exist
           mit = keyval.find( key );
           if( keyval.end() == mit ) {//first log
               keyval.insert(pair<string,int>(key, default_val));
           } else {//update value
               int tmpval = default_val + mit->second;
               keyval.erase(mit);
               keyval.insert(pair<string,int>(key,tmpval));
           }
       } else {
           last_str = key;

           process();
           keyval.insert(pair<string,int>(key, default_val));
       }
   }

   //process last buf
   process();
   return 0;
}