import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * MapReduce job computing statistics over the global survey data behind the
 * article "Emerging ecosystems: Python vs. R — which suits big data
 * (Spark/Hadoop) and deep learning better?"
 * (https://mp.weixin.qq.com/s/3qQqN6qzQ3a8_Au2qfZnVg).
 */
22 public class wechat extends Configured implementsTool {23
24 /**
25 * Map方法26 */
27 private static class ModuleMapper extends Mapper{28 private static final IntWritable mapOutputValue = new IntWritable(1) ;29 private Text mapOutputKey = newText() ;30 @Override31 public voidmap(LongWritable key, Text value, Context context)32 throwsIOException, InterruptedException {33
34 String input =value.toString();35 if(input.split(",").length<16) {36 return;37 }38 String[] arrStr = input.split(",");39 //Python-大数据计数器输出
40 if("1".equals(arrStr[2])&&"1".equals(arrStr[14])) {41 context.getCounter("WECHAT_MAPPER_COUNTERS", "Python_BigData").increment(1L);42 }43 //Python-Deep计数器输出
44 if("1".equals(arrStr[2])&&"1".equals(arrStr[13])) {45 context.getCounter("WECHAT_MAPPER_COUNTERS", "Python_Deep-Learning").increment(1L);46 }47 //R-大数据计数器输出
48 if("1".equals(arrStr[3])&&"1".equals(arrStr[14])) {49 context.getCounter("WECHAT_MAPPER_COUNTERS", "R_BigData").increment(1L);50 }51 //R-深度计数器输出
52 if("1".equals(arrStr[3])&&"1".equals(arrStr[13])) {53 context.getCounter("WECHAT_MAPPER_COUNTERS", "R_Deep-Learning").increment(1L);54 }55
56 arrStr = input.split(",")[16].split(";");57 //遍历
58 for(String tool: arrStr){59 //设置key
60 mapOutputKey.set(tool);61 //输出
62 context.write(mapOutputKey, mapOutputValue) ;63 }64 }65 }66
67 /**
68 * Reduce聚合结果69 */
70 private static class ModuleReducer extends Reducer{71 private IntWritable outputValue = newIntWritable() ;72 @Override73 protected void reduce(Text key, Iterablevalues, Context context)74 throwsIOException, InterruptedException {75
76 //定义临时变量,用于累加
77 int sum = 0;78
79 //遍历
80 for(IntWritable value: values){81 sum +=value.get() ;82 }83
84 if(sum < 500){85 //定义500以上的筛选
86 return;87 }88 //设置
89 outputValue.set(sum) ;90 //输出
91 context.write(key, outputValue) ;92
93 }94 }95
96 /**
97 * 驱动创建Job并提交运行 返回状态码98 */
99
100 public int run(String[] args) throwsException {101 //创建一个Job
102 Job job =Job.getInstance(103 this.getConf() , wechat.class.getSimpleName()104 ) ;105 //设置job运行的class
106 job.setJarByClass(wechat.class);107
108 //设置Job109 //1. 设置 input,从哪里读取数据
110 Path inputPath = new Path(args[0]) ;111 FileInputFormat.addInputPath(job, inputPath);112
113 //2. 设置 mapper类
114 job.setMapperClass(ModuleMapper.class);115 //设置map 输出的key和value的数据类型
116 job.setMapOutputKeyClass(Text.class);117 job.setMapOutputValueClass(IntWritable.class);118
119 //3. 设置 reducer 类
120 job.setReducerClass(ModuleReducer.class);121 //设置 reducer 输出的key和value的数据类型
122 job.setOutputKeyClass(Text.class);123 job.setOutputValueClass(IntWritable.class);124 //设置ReduceTask个数125 //job.setNumReduceTasks(2);126
127 //4. 设置 处理结果保存的路径
128 Path outputPath = new Path(args[1]) ;129 FileOutputFormat.setOutputPath(job, outputPath);130
131 //提交job运行
132 boolean isSuccess = job.waitForCompletion(true) ;133
134 //返回状态
135 return isSuccess ? 0 : 1;136 }137
138 /**
139 *140 *@paramargs141 *@throwsException142 */
143 public static void main(String[] args) throwsException {144 if(2 >args.length){145 System.out.println("Usage: " + wechat.class.getSimpleName() +" ");146 return;147 }148
149 //读取HADOOP中配置文件, core-*.xml hdfs-*.xml yarn-*.xml mapred-*.xml
150 Configuration conf = newConfiguration() ;151
152 //运行Job
153 int status = ToolRunner.run(conf, newwechat(), args) ;154
155 //exit program
156 System.exit(status);157 }158
159 }