MongoDB是NoSQL领域里非常流行的一款非关系型数据库,提供了强大的分片存储与查询功能,比较适合用来做历史数据(日志)的存储与查询,本身也提供了MapReduce功能。但是MongoDB的使用者并不是任何时候都会使用分片功能,更大的可能是使用副本集的方式(有时候机器并不多)。而Hadoop提供了HDFS和分布式计算的功能,我们可以利用Hadoop的MapReduce来取代MongoDB的MapReduce,用MongoDB的副本集来取代Hadoop的HDFS,于是就有了Hadoop与MongoDB之间的连接器(adapter):mongo-hadoop-master项目(目前在GitHub上可以下载到)。
一 :下载地址:https://github.com/mongodb/mongo-hadoop
二: 下载之后解压:
[root@bigdata2 software]# cd mongo-hadoop-master [root@bigdata2 mongo-hadoop-master]# ll total 140 drwxr-xr-x 3 root root 4096 Oct 15 11:53 bin -rw-r--r-- 1 root root 5848 Oct 15 11:53 BSON_README.md drwxr-xr-x 4 root root 4096 Nov 30 13:06 build -rwxr-xr-x 1 root root 168 Oct 15 11:53 build-all.sh -rw-r--r-- 1 root root 12731 Oct 15 11:53 build.gradle drwxr-xr-x 2 root root 4096 Oct 15 11:53 clusterConfigs drwxr-xr-x 2 root root 4096 Oct 15 11:53 config -rw-r--r-- 1 root root 7458 Oct 15 11:53 CONFIG.md drwxr-xr-x 4 root root 4096 Nov 30 13:06 core drwxr-xr-x 6 root root 4096 Oct 15 11:53 docs drwxr-xr-x 7 root root 4096 Oct 15 11:53 examples drwxr-xr-x 3 root root 4096 Oct 15 11:53 flume drwxr-xr-x 3 root root 4096 Oct 15 11:53 gradle -rwxr-xr-x 1 root root 5080 Oct 15 11:53 gradlew -rw-r--r-- 1 root root 2314 Oct 15 11:53 gradlew.bat -rw-r--r-- 1 root root 1862 Oct 15 11:53 History.md drwxr-xr-x 3 root root 4096 Oct 15 11:53 hive drwxr-xr-x 3 root root 4096 Oct 15 11:53 integration-tests -rw-r--r-- 1 root root 6764 Oct 15 11:53 mongo-defaults.xml -rw------- 1 root root 4843 Nov 30 13:12 nohup.out drwxr-xr-x 3 root root 4096 Oct 15 11:53 pig -rw-r--r-- 1 root root 5106 Oct 15 11:53 README.md -rw-r--r-- 1 root root 137 Oct 15 11:53 settings.gradle drwxr-xr-x 5 root root 4096 Oct 15 11:53 streaming -rwxr-xr-x 1 root root 682 Oct 15 11:53 test.sh drwxr-xr-x 2 root root 4096 Oct 15 11:53 tools [root@bigdata2 mongo-hadoop-master]#
其中examples目录是自带的测试案例,我这里会采用 mongo-hadoop-master/examples/treasury_yield 这个案例里 src/main/resources/ 下面的json数据(yield_historical_in.json),部分内容如下:
{ "_id" : { "$date" : 631324800000 }, "dayOfWeek" : "WEDNESDAY", "bc3Year" : 7.96, "bc5Year" : 7.92, "bc10Year" : 7.99, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 7.94, "bc3Month" : 7.89, "bc30Year" : 8.039999999999999, "bc1Year" : 7.85, "bc7Year" : 8.039999999999999, "bc6Month" : 7.94 }
{ "_id" : { "$date" : 631411200000 }, "dayOfWeek" : "THURSDAY", "bc3Year" : 7.93, "bc5Year" : 7.91, "bc10Year" : 7.98, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 7.92, "bc3Month" : 7.84, "bc30Year" : 8.039999999999999, "bc1Year" : 7.82, "bc7Year" : 8.02, "bc6Month" : 7.9 }
{ "_id" : { "$date" : 631497600000 }, "dayOfWeek" : "FRIDAY", "bc3Year" : 7.94, "bc5Year" : 7.92, "bc10Year" : 7.99, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 7.9, "bc3Month" : 7.79, "bc30Year" : 8.06, "bc1Year" : 7.79, "bc7Year" : 8.029999999999999, "bc6Month" : 7.85 }
{ "_id" : { "$date" : 631756800000 }, "dayOfWeek" : "MONDAY", "bc3Year" : 7.95, "bc5Year" : 7.92, "bc10Year" : 8.02, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 7.9, "bc3Month" : 7.79, "bc30Year" : 8.09, "bc1Year" : 7.81, "bc7Year" : 8.050000000000001, "bc6Month" : 7.88 }
{ "_id" : { "$date" : 631843200000 }, "dayOfWeek" : "TUESDAY", "bc3Year" : 7.94, "bc5Year" : 7.92, "bc10Year" : 8.02, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 7.91, "bc3Month" : 7.8, "bc30Year" : 8.1, "bc1Year" : 7.78, "bc7Year" : 8.050000000000001, "bc6Month" : 7.82 }
{ "_id" : { "$date" : 631929600000 }, "dayOfWeek" : "WEDNESDAY", "bc3Year" : 7.95, "bc5Year" : 7.92, "bc10Year" : 8.029999999999999, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 7.91, "bc3Month" : 7.75, "bc30Year" : 8.109999999999999, "bc1Year" : 7.77, "bc7Year" : 8, "bc6Month" : 7.78 }
{ "_id" : { "$date" : 632016000000 }, "dayOfWeek" : "THURSDAY", "bc3Year" : 7.95, "bc5Year" : 7.94, "bc10Year" : 8.039999999999999, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 7.91, "bc3Month" : 7.8, "bc30Year" : 8.109999999999999, "bc1Year" : 7.77, "bc7Year" : 8.01, "bc6Month" : 7.8 }
{ "_id" : { "$date" : 632102400000 }, "dayOfWeek" : "FRIDAY", "bc3Year" : 7.98, "bc5Year" : 7.99, "bc10Year" : 8.1, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 7.93, "bc3Month" : 7.74, "bc30Year" : 8.17, "bc1Year" : 7.76, "bc7Year" : 8.07, "bc6Month" : 7.81 }
{ "_id" : { "$date" : 632448000000 }, "dayOfWeek" : "TUESDAY", "bc3Year" : 8.130000000000001, "bc5Year" : 8.109999999999999, "bc10Year" : 8.199999999999999, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.1, "bc3Month" : 7.89, "bc30Year" : 8.25, "bc1Year" : 7.92, "bc7Year" : 8.18, "bc6Month" : 7.99 }
{ "_id" : { "$date" : 632534400000 }, "dayOfWeek" : "WEDNESDAY", "bc3Year" : 8.109999999999999, "bc5Year" : 8.109999999999999, "bc10Year" : 8.19, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.09, "bc3Month" : 7.97, "bc30Year" : 8.25, "bc1Year" : 7.91, "bc7Year" : 8.17, "bc6Month" : 7.97 }
{ "_id" : { "$date" : 632620800000 }, "dayOfWeek" : "THURSDAY", "bc3Year" : 8.279999999999999, "bc5Year" : 8.27, "bc10Year" : 8.32, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.25, "bc3Month" : 8.039999999999999, "bc30Year" : 8.35, "bc1Year" : 8.050000000000001, "bc7Year" : 8.31, "bc6Month" : 8.08 }
{ "_id" : { "$date" : 632707200000 }, "dayOfWeek" : "FRIDAY", "bc3Year" : 8.23, "bc5Year" : 8.199999999999999, "bc10Year" : 8.26, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.199999999999999, "bc3Month" : 8, "bc30Year" : 8.289999999999999, "bc1Year" : 8, "bc7Year" : 8.24, "bc6Month" : 8.01 }
{ "_id" : { "$date" : 632966400000 }, "dayOfWeek" : "MONDAY", "bc3Year" : 8.199999999999999, "bc5Year" : 8.19, "bc10Year" : 8.27, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.18, "bc3Month" : 7.99, "bc30Year" : 8.31, "bc1Year" : 7.98, "bc7Year" : 8.25, "bc6Month" : 7.99 }
{ "_id" : { "$date" : 633052800000 }, "dayOfWeek" : "TUESDAY", "bc3Year" : 8.199999999999999, "bc5Year" : 8.18, "bc10Year" : 8.26, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.18, "bc3Month" : 7.93, "bc30Year" : 8.289999999999999, "bc1Year" : 7.97, "bc7Year" : 8.23, "bc6Month" : 7.97 }
{ "_id" : { "$date" : 633139200000 }, "dayOfWeek" : "WEDNESDAY", "bc3Year" : 8.289999999999999, "bc5Year" : 8.279999999999999, "bc10Year" : 8.380000000000001, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.199999999999999, "bc3Month" : 7.93, "bc30Year" : 8.41, "bc1Year" : 8, "bc7Year" : 8.34, "bc6Month" : 7.99 }
{ "_id" : { "$date" : 633225600000 }, "dayOfWeek" : "THURSDAY", "bc3Year" : 8.32, "bc5Year" : 8.31, "bc10Year" : 8.42, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.24, "bc3Month" : 7.95, "bc30Year" : 8.460000000000001, "bc1Year" : 8.029999999999999, "bc7Year" : 8.390000000000001, "bc6Month" : 8.01 }
{ "_id" : { "$date" : 633312000000 }, "dayOfWeek" : "FRIDAY", "bc3Year" : 8.380000000000001, "bc5Year" : 8.380000000000001, "bc10Year" : 8.49, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.279999999999999, "bc3Month" : 7.93, "bc30Year" : 8.550000000000001, "bc1Year" : 8.07, "bc7Year" : 8.449999999999999, "bc6Month" : 8.039999999999999 }
{ "_id" : { "$date" : 633571200000 }, "dayOfWeek" : "MONDAY", "bc3Year" : 8.390000000000001, "bc5Year" : 8.390000000000001, "bc10Year" : 8.5, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.300000000000001, "bc3Month" : 8, "bc30Year" : 8.539999999999999, "bc1Year" : 8.08, "bc7Year" : 8.449999999999999, "bc6Month" : 8.09 }
{ "_id" : { "$date" : 633657600000 }, "dayOfWeek" : "TUESDAY", "bc3Year" : 8.390000000000001, "bc5Year" : 8.43, "bc10Year" : 8.51, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.300000000000001, "bc3Month" : 8, "bc30Year" : 8.550000000000001, "bc1Year" : 8.09, "bc7Year" : 8.470000000000001, "bc6Month" : 8.140000000000001 }
{ "_id" : { "$date" : 633744000000 }, "dayOfWeek" : "WEDNESDAY", "bc3Year" : 8.359999999999999, "bc5Year" : 8.35, "bc10Year" : 8.43, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.279999999999999, "bc3Month" : 8, "bc30Year" : 8.460000000000001, "bc1Year" : 8.08, "bc7Year" : 8.390000000000001, "bc6Month" : 8.130000000000001 }
{ "_id" : { "$date" : 633830400000 }, "dayOfWeek" : "THURSDAY", "bc3Year" : 8.35, "bc5Year" : 8.35, "bc10Year" : 8.42, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.279999999999999, "bc3Month" : 8.02, "bc30Year" : 8.44, "bc1Year" : 8.09, "bc7Year" : 8.380000000000001, "bc6Month" : 8.130000000000001 }
{ "_id" : { "$date" : 633916800000 }, "dayOfWeek" : "FRIDAY", "bc3Year" : 8.43, "bc5Year" : 8.42, "bc10Year" : 8.5, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.369999999999999, "bc3Month" : 8.07, "bc30Year" : 8.51, "bc1Year" : 8.130000000000001, "bc7Year" : 8.460000000000001, "bc6Month" : 8.17 }
{ "_id" : { "$date" : 634176000000 }, "dayOfWeek" : "MONDAY", "bc3Year" : 8.43, "bc5Year" : 8.44, "bc10Year" : 8.529999999999999, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.369999999999999, "bc3Month" : 8.08, "bc30Year" : 8.529999999999999, "bc1Year" : 8.15, "bc7Year" : 8.48, "bc6Month" : 8.18 }
{ "_id" : { "$date" : 634262400000 }, "dayOfWeek" : "TUESDAY", "bc3Year" : 8.43, "bc5Year" : 8.49, "bc10Year" : 8.57, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.42, "bc3Month" : 8.09, "bc30Year" : 8.58, "bc1Year" : 8.15, "bc7Year" : 8.52, "bc6Month" : 8.17 }
{ "_id" : { "$date" : 634348800000 }, "dayOfWeek" : "WEDNESDAY", "bc3Year" : 8.43, "bc5Year" : 8.51, "bc10Year" : 8.52, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.42, "bc3Month" : 8.08, "bc30Year" : 8.57, "bc1Year" : 8.17, "bc7Year" : 8.529999999999999, "bc6Month" : 8.19 }
{ "_id" : { "$date" : 634435200000 }, "dayOfWeek" : "THURSDAY", "bc3Year" : 8.390000000000001, "bc5Year" : 8.449999999999999, "bc10Year" : 8.49, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.369999999999999, "bc3Month" : 8.08, "bc30Year" : 8.5, "bc1Year" : 8.130000000000001, "bc7Year" : 8.48, "bc6Month" : 8.18 }
{ "_id" : { "$date" : 634521600000 }, "dayOfWeek" : "FRIDAY", "bc3Year" : 8.24, "bc5Year" : 8.289999999999999, "bc10Year" : 8.31, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.25, "bc3Month" : 8.02, "bc30Year" : 8.359999999999999, "bc1Year" : 8.029999999999999, "bc7Year" : 8.34, "bc6Month" : 8.09 }
三: 我们查看他的README.md,可以看出 ,需要编译
## Building The mongo-hadoop connector currently supports the following versions of hadoop: 0.23, 1.0, 1.1, 2.2, 2.3, 2.4, and CDH 4 abd 5. The default build version will build against the last Apache Hadoop (currently 2.4). If you would like to build against a specific version of Hadoop you simply need to pass `-PclusterVersion=<your version>` to gradlew when building. Run `./gradlew jar` to build the jars. The jars will be placed in to `build/libs` for each module. e.g. for the core module, it will be generated in the `core/build/libs` directory. After successfully building, you must copy the jars to the lib directory on each node in your hadoop cluster. This is usually one of the following locations, depending on which Hadoop release you are using: * `$HADOOP_HOME/lib/` * `$HADOOP_HOME/share/hadoop/mapreduce/` * `$HADOOP_HOME/share/hadoop/lib/` ## Supported Distributions of Hadoop | Hadoop Version | Build Parameter | | :----------------------------------: | :---------------------: | | Apache Hadoop 0.23 | -PclusterVersion='0.23' | | Apache Hadoop 1.0 | -PclusterVersion='1.0' | | Apache Hadoop 1.1 | -PclusterVersion='1.1' | | Apache Hadoop 2.2 | -PclusterVersion='2.2' | | Apache Hadoop 2.3 | -PclusterVersion='2.3' | | Apache Hadoop 2.4 | -PclusterVersion='2.4' | --More--(49%)
我们按照下面指令编译:
./gradlew jar
编译过程比较缓慢,其中需要下载的一个较大的依赖是Amazon的S3相关包,有250多M。完成以后,会在core/build/libs目录下生成Jar包 mongo-hadoop-core-1.4.0-SNAPSHOT.jar(最大的战斗成果。。),我们带上Java连接MongoDB的驱动,一起拷贝到$HADOOP_HOME/lib里面,当然也可以采用运行时加载的方法:
DistributedCache.addFileToClassPath(new Path("/root/software/mongo-java-driver-2.11.1.jar"), conf);
DistributedCache.addFileToClassPath(new Path("/root/software/mongo-hadoop-core-1.4.0-SNAPSHOT.jar"), conf);
有了编译好的驱动,我们就可以用它来连接Mongodb了。
四:首先我们准备数据,把刚才的数据导入到mongodb
mongoimport --host 127.0.0.1 --port 27017 -d testmr -c example --file ./yield_historical_in.json
查看数据:
example
mongotest
system.indexes
> db.example.find().limit(2);
{ "_id" : ISODate("1990-01-02T00:00:00Z"), "dayOfWeek" : "TUESDAY", "bc3Year" :
7.9, "bc5Year" : 7.87, "bc10Year" : 7.94, "bc20Year" : null, "bc1Month" : null,
"bc2Year" : 7.87, "bc3Month" : 7.83, "bc30Year" : 8, "bc1Year" : 7.81, "bc7Year"
: 7.98, "bc6Month" : 7.89 }
{ "_id" : ISODate("1990-01-03T00:00:00Z"), "dayOfWeek" : "WEDNESDAY", "bc3Year"
: 7.96, "bc5Year" : 7.92, "bc10Year" : 7.99, "bc20Year" : null, "bc1Month" : nul
l, "bc2Year" : 7.94, "bc3Month" : 7.89, "bc30Year" : 8.04, "bc1Year" : 7.85, "bc
7Year" : 8.04, "bc6Month" : 7.94 }
>
五:新建一个MapReduce工程
import java.io.IOException;
import java.util.Date;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.bson.BSONObject;
/**
 * Mapper for the treasury-yield example: for every input document it emits
 * the calendar year taken from the document's {@code _id} date as the key and
 * the document's {@code bc10Year} value as the value.
 */
public class MongoTestMapper extends Mapper<Object, BSONObject, IntWritable, DoubleWritable> {

    /** {@link Date#getYear()} returns "calendar year minus 1900", so add this back. */
    private static final int DATE_YEAR_OFFSET = 1900;

    /**
     * Emits one (year, bc10Year) pair per input document.
     *
     * @param pkey    input key (not used by this mapper)
     * @param pvalue  input document; {@code _id} is read as a {@link Date} and
     *                {@code bc10Year} as a {@link Number}
     * @param context Hadoop context the pair is written to
     * @throws IOException          if writing the pair fails
     * @throws InterruptedException if the write is interrupted
     */
    @Override
    public void map(final Object pkey, final BSONObject pvalue, final Context context)
            throws IOException, InterruptedException {
        final Object id = pvalue.get("_id");
        final Object rate = pvalue.get("bc10Year");

        // Yield fields may be null in the source data; a record without a usable
        // date or 10-year value cannot contribute to a per-year average, so skip
        // it instead of failing the task with a NullPointerException.
        if (!(id instanceof Date) || !(rate instanceof Number)) {
            return;
        }

        // NOTE: Date#getYear() is deprecated and uses the JVM default time zone;
        // it is kept here to avoid changing which year boundary documents fall on.
        final int year = ((Date) id).getYear() + DATE_YEAR_OFFSET;
        final double bdyear = ((Number) rate).doubleValue();

        // Let write failures propagate so the framework can fail or retry the
        // task rather than silently dropping output.
        context.write(new IntWritable(year), new DoubleWritable(bdyear));
    }
}
/**
 * Reducer for the treasury-yield example: computes the arithmetic mean of all
 * values received for a key and writes it out as a BSON document of the form
 * {@code {"avg": <mean>}} under the same key.
 */
public class MongoTestReducer extends Reducer<IntWritable, DoubleWritable, IntWritable, BSONWritable> {

    /**
     * Averages the values of one key and emits the result.
     *
     * @param pKey     key whose values are being averaged; reused as the output key
     * @param pValues  values collected for {@code pKey}
     * @param pContext Hadoop context the result is written to
     * @throws IOException          if writing the result fails
     * @throws InterruptedException if the write is interrupted
     */
    public void reduce(final IntWritable pKey,
                       final Iterable<DoubleWritable> pValues,
                       final Context pContext) throws IOException, InterruptedException {
        double total = 0.0;
        int seen = 0;
        for (final DoubleWritable sample : pValues) {
            seen += 1;
            total += sample.get();
        }

        // Wrap the mean in a single-field BSON document.
        final BasicBSONObject result = new BasicBSONObject();
        result.put("avg", total / seen);
        pContext.write(pKey, new BSONWritable(result));
    }
}
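这是一个计算平均值的例子的部分代码,之后在Hadoop环境上运行,可以看到输出到MongoDB的结果如下。注意:下面输出中的 _id 比实际年份大90(数据从1990年开始,实际对应1990–2009年),原因是 Date.getYear() 返回的是"年份减1900",换算成真实年份时应加1900。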
{ "_id" : 2080, "avg" : 8.552400000000002 }
{ "_id" : 2081, "avg" : 7.8623600000000025 }
{ "_id" : 2082, "avg" : 7.008844621513946 }
{ "_id" : 2083, "avg" : 5.866279999999999 }
{ "_id" : 2084, "avg" : 7.085180722891565 }
{ "_id" : 2085, "avg" : 6.573920000000002 }
{ "_id" : 2086, "avg" : 6.443531746031742 }
{ "_id" : 2087, "avg" : 6.353959999999992 }
{ "_id" : 2088, "avg" : 5.262879999999994 }
{ "_id" : 2089, "avg" : 5.646135458167332 }
{ "_id" : 2090, "avg" : 6.030278884462145 }
{ "_id" : 2091, "avg" : 5.02068548387097 }
{ "_id" : 2092, "avg" : 4.61308 }
{ "_id" : 2093, "avg" : 4.013879999999999 }
{ "_id" : 2094, "avg" : 4.271320000000004 }
{ "_id" : 2095, "avg" : 4.288880000000001 }
{ "_id" : 2096, "avg" : 4.7949999999999955 }
{ "_id" : 2097, "avg" : 4.634661354581674 }
{ "_id" : 2098, "avg" : 3.6642629482071714 }
{ "_id" : 2099, "avg" : 3.2641200000000037 }
Type "it" for more