接上一篇文章:Linux环境下配置Eclipse和Maven
此篇里面的数据是我参与开发的一个微信公众号的用户信息,通过MapReduce来统计每个月的注册人数。(由于涉及到用户隐私,所以数据不公开)。
在该表中,有一列create_time,这是用户注册的时间。
1、将/opt/cdh5.14.2/hadoop-2.6.0/etc/hadoop目录下的三个文件复制到resource目录下
2、将表数据上传到HDFS
3、Mapper类
package com.zzw.mapreduce.userinfo;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Maps one tab-separated user record to ("yyyy-MM", 1), where the month is
 * derived from the create_time column (epoch milliseconds). Malformed rows
 * are skipped rather than failing the task.
 */
public class UserInfoMapper extends
        Mapper<LongWritable, Text, Text, IntWritable> {

    /** Expected number of tab-separated columns per input record. */
    private static final int EXPECTED_COLUMNS = 13;
    /** Index of the create_time column (epoch milliseconds) — TODO confirm against table schema. */
    private static final int CREATE_TIME_INDEX = 11;

    private final Text mapOutputKey = new Text();
    private final IntWritable mapOutputValue = new IntWritable(1);
    // SimpleDateFormat is not thread-safe, but each Mapper instance is driven
    // by a single task thread, so one reusable instance per mapper is safe and
    // avoids allocating a new formatter for every record.
    private final SimpleDateFormat monthFormat = new SimpleDateFormat("yyyy-MM");

    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] splits = value.toString().split("\\t");
        // split() never returns null, so only the column count needs checking;
        // skip malformed lines instead of failing the whole task.
        if (splits.length != EXPECTED_COLUMNS) {
            return;
        }
        String createTime = splits[CREATE_TIME_INDEX].trim();
        try {
            String month = monthFormat.format(new Date(Long.parseLong(createTime)));
            mapOutputKey.set(month);
            context.write(mapOutputKey, mapOutputValue);
        } catch (NumberFormatException ignored) {
            // create_time is not a valid epoch-millis value; skip the record
            // rather than killing the task with an uncaught exception.
        }
    }
}
4、Reducer类
package com.zzw.mapreduce.userinfo;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Sums the partial counts emitted by the mapper for each month key and
 * writes (month, total) to the job output.
 */
public class UserInfoReducer extends
        Reducer<Text, IntWritable, Text, IntWritable> {

    // Reused across calls to avoid allocating a writable per output key.
    private final IntWritable totalWritable = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int total = 0;
        for (IntWritable partial : values) {
            total += partial.get();
        }
        totalWritable.set(total);
        context.write(key, totalWritable);
    }
}
5、UserInfoMapReduce类
package com.zzw.mapreduce.userinfo;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver for the monthly-registration-count job: wires UserInfoMapper and
 * UserInfoReducer together and submits the job to the cluster.
 */
public class UserInfoMapReduce {

    /** Default HDFS paths used only when no command-line arguments are given. */
    private static final String DEFAULT_INPUT =
            "hdfs://master.cdh.com:8020/input/bbc/userinfo";
    private static final String DEFAULT_OUTPUT =
            "hdfs://master.cdh.com:8020/output/userinfo7";

    /**
     * Configures and runs the job, blocking until it completes.
     *
     * @param args args[0] = input path, args[1] = output path
     * @return 0 on success, 1 on failure
     * @throws Exception if job configuration or submission fails
     */
    public int run(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, this.getClass().getSimpleName());
        job.setJarByClass(this.getClass());

        // input
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // output (the directory must not already exist, or the job fails at submit time)
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // mapper
        job.setMapperClass(UserInfoMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // reducer
        job.setReducerClass(UserInfoReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // submit and wait for completion (verbose progress logging on)
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // Fall back to the hard-coded cluster paths only when the caller
        // supplies none — the original unconditionally overwrote args,
        // which made the command-line arguments impossible to use.
        if (args.length < 2) {
            args = new String[] { DEFAULT_INPUT, DEFAULT_OUTPUT };
        }
        int status = new UserInfoMapReduce().run(args);
        System.exit(status);
    }
}
6、运行结果
查看数据结果:
[root@master hadoop-2.6.0]# bin/hdfs dfs -text /output/userinfo7/part*
18/05/20 17:19:09 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2017-10 17
2017-11 28
2017-12 41
2018-01 18
2018-02 14
2018-03 32
2018-04 13
2018-05 4
[root@master hadoop-2.6.0]#