一.准备工作
1.需要依赖的核心jar包,maven工程里引入的核心如下:
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-jobclient</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>casbah-core_2.11</artifactId>
<version>3.1.1</version>
</dependency>
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-hadoop</artifactId>
<version>1.0.0</version>
</dependency>
<!--连接mongodb的核心jar包-->
<dependency>
<groupId>org.mongodb.mongo-hadoop</groupId>
<artifactId>mongo-hadoop-core</artifactId>
<version>2.0.1</version>
</dependency>
<!--连接mongodb的核心jar包-->
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
<version>3.3.0</version>
</dependency>
二.直接上代码:
map和reduce里面的自定义代码,按照自己的需求写就好,主要看run的方法代码
/**
 * MapReduce job that reads documents from a MongoDB collection (via the
 * mongo-hadoop connector's {@code MongoInputFormat}), counts occurrences of
 * each {@code tag_id} found inside the "gold" array field of every document,
 * and writes the per-tag counts as text to HDFS.
 */
public class MongoConnectTest extends Configured implements Tool {

    /**
     * Mapper: for each Mongo document, emits (tag_id + "i", 1) for every
     * element of the document's "gold" array that carries a "tag_id" field.
     */
    private static class MongoMapper extends Mapper<Object, BSONObject, Text, IntWritable> {
        // Reused output key to avoid allocating a Text per record.
        private final Text word = new Text();
        private static final IntWritable ONE = new IntWritable(1);

        @Override
        protected void map(Object key, BSONObject value, Context context)
                throws IOException, InterruptedException {
            // Only documents that actually have a "gold" array contribute counts.
            if (value.get("gold") != null) {
                @SuppressWarnings("unchecked")
                ArrayList<BSONObject> gold = (ArrayList<BSONObject>) value.get("gold");
                for (int i = 0; i < gold.size(); i++) {
                    // NOTE: the original text suggested prefixing a random 6-digit
                    // number to the tag_id to spread skewed keys across reducers;
                    // that variant is kept here as a comment:
                    //   int random = (int) ((Math.random() * 90) + 10);
                    //   String tag_id = random + gold.get(i).get("tag_id").toString();
                    // BUG FIX: the original code referenced an undefined variable
                    // "interest"; the list declared above is named "gold".
                    String tag_id = gold.get(i).get("tag_id").toString() + "i";
                    word.set(tag_id);
                    // Emit ("<tag_id>i", 1); the combiner/reducer sums these.
                    context.write(word, ONE);
                }
            }
        }
    }

    /**
     * Reducer (also used as combiner — summation is associative and
     * commutative, so that is safe): sums the counts for each tag key.
     */
    private static class MongoReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        // Reused output value/key objects.
        private final IntWritable count = new IntWritable();
        private final Text word = new Text();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // Aggregate all partial counts emitted by the map side.
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            count.set(sum);
            // If a random prefix were added in the mapper, it would be stripped
            // here, e.g.:
            //   String substring = key.toString().substring(2);
            //   word.set(substring);
            word.set(key.toString());
            System.out.println(word + ":" + count);
            context.write(word, count);
        }
    }

    /**
     * Configures and submits the job.
     *
     * @param args args[0] is the HDFS output path; it is deleted first if it
     *             already exists so the job can be rerun.
     * @return 0 on success, 1 on failure (standard Tool convention)
     */
    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Input comes from MongoDB. If results should also go back to Mongo,
        // set "mongo.output.uri" in the same URI format; here the output goes
        // to HDFS instead, so only the input URI is configured.
        conf.set("mongo.input.uri", "mongodb://testuser:testuser@10.10.8.102:20000,10.10.8.104:20000,10.10.8.107:20000/test.mobile_user");
        // conf.set("mongo.output.uri", "mongodb://testuser:testuser@10.10.8.102:20000,10.10.8.104:20000,10.10.8.107:20000/test.mobile_user");

        Job job = Job.getInstance(conf, "Mongo Connection");
        job.setJarByClass(MongoConnectTest.class);
        job.setMapperClass(MongoMapper.class);
        // Map-side partial aggregation to cut shuffle volume.
        job.setCombinerClass(MongoReducer.class);
        job.setReducerClass(MongoReducer.class);
        job.setInputFormatClass(MongoInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Output path: supplied as the first CLI argument (could also be
        // hard-coded). Remove a stale output directory so the job can rerun.
        Path outputPath = new Path(args[0]);
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        // FileInputFormat.setInputPaths(job, inputPath); // not needed: input is Mongo
        FileOutputFormat.setOutputPath(job, outputPath);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        int exitCode = ToolRunner.run(conf, new MongoConnectTest(), args);
        System.exit(exitCode);
    }
}
三.打jar包到hadoop集群上运行。
$HADOOP_HOME/bin/hadoop jar /.....jar包的路径../MongoConnectTest.jar com.java.MongoConnectTest /tmp/output1/
/tmp/output1/ 为传入一个参数,结果输出到HDFS上的路径
这里需要注意的是,如果运行jar包时报错 NoClassDefFoundError 等找不到类加载路径的异常,可能是打jar包时依赖jar包没有被打进去,
可以在桌面上建一个名字为lib的文件夹,然后把依赖jar包放进去,再把lib文件夹直接拖到你打好待运行的jar里(双击MongoConnectTest.jar拖进去)