Hadoop MR Java 代码,统计结果输出到日志文件中
package vitamin.user_static_table;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Map-only statistics job: tallies per-field counts via Hadoop Counters so the
 * results appear directly in the job log (no reducer output is consumed).
 *
 * <p>Expected record layout (tab-separated, at least 5 fields) -- inferred from
 * usage below, TODO confirm against the producing job:
 * tks[0]=uid, tks[1]=group flag, tks[2]=sex flag, tks[3]=age flag,
 * tks[4]=tag count ("_" marks a missing value).
 */
public class GetUserStaticMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    /** Reusable output value; every emitted key counts as 1. */
    private final LongWritable out = new LongWritable(1);
    /** Reusable output key -- standard Hadoop pattern to avoid one Text allocation per record. */
    private final Text outKey = new Text();

    /**
     * Parses one tab-separated record, increments the relevant counters in
     * group "NULL", and emits {@code uid_<group>_<uid> -> 1}.
     *
     * @throws IOException          propagated from {@link Context#write}
     * @throws InterruptedException propagated from {@link Context#write}
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString().trim();
        String[] tks = line.split("\t");
        if (tks.length < 5) {
            return; // skip malformed records
        }

        // Total number of well-formed input records.
        context.getCounter("NULL", "all").increment(1L);

        outKey.set("uid_" + tks[1] + "_" + tks[0]);
        context.write(outKey, out);

        // Only count records where both the sex and age fields are present.
        if (!(tks[2].equals("_") || tks[3].equals("_"))) {
            context.getCounter("NULL", "c_" + tks[1]).increment(1L);
            if (tks[2].equals("1")) {
                context.getCounter("NULL", "sex_c_" + tks[1]).increment(1L);
            }
            if (tks[3].equals("1")) {
                context.getCounter("NULL", "age_c_" + tks[1]).increment(1L);
            }
        }

        // Sum tag counts for group "1". Long.parseLong replaces the original
        // Integer.parseInt(...) * 1L: the logged aggregate (tag_c_1 ~ 6.9e9)
        // already exceeds int range, so large individual values must not be
        // parsed as int. NOTE(review): a non-numeric tks[4] still throws
        // NumberFormatException and fails the task, matching original behavior.
        if (tks[1].equals("1") && !tks[4].equals("_") && !tks[4].equals("0")) {
            context.getCounter("NULL", "tag_c1_sum").increment(1L);
            context.getCounter("NULL", "tag_c_1").increment(Long.parseLong(tks[4]));
        }
    }
}
统计结果直接输出到log文件中,不进入reducer里,输出如下(269-277为输出统计量):
426 17/08/09 09:05:38 INFO mapreduce.Job: Job job_1493284708000_147282 completed successfully
427 17/08/09 09:05:38 INFO mapreduce.Job: Counters: 53
428 File System Counters
429 FILE: Number of bytes read=768
430 FILE: Number of bytes written=28933956
431 FILE: Number of read operations=0
432 FILE: Number of large read operations=0
433 FILE: Number of write operations=0
434 HDFS: Number of bytes read=374470086
435 HDFS: Number of bytes written=0
436 HDFS: Number of read operations=684
437 HDFS: Number of large read operations=0
438 HDFS: Number of write operations=256
439 Job Counters
440 Killed map tasks=2
441 Launched map tasks=102
442 Launched reduce tasks=128
443 Data-local map tasks=14
444 Rack-local map tasks=88
445 Total time spent by all maps in occupied slots (ms)=3509175
446 Total time spent by all reduces in occupied slots (ms)=9734817
447 Total time spent by all map tasks (ms)=701835
448 Total time spent by all reduce tasks (ms)=3244939
449 Total vcore-seconds taken by all map tasks=701835
450 Total vcore-seconds taken by all reduce tasks=3244939
451 Total megabyte-seconds taken by all map tasks=3593395200
452 Total megabyte-seconds taken by all reduce tasks=9968452608
453 Map-Reduce Framework
454 Map input records=13186141
455 Map output records=0
456 Map output bytes=0
457 Map output materialized bytes=76800
458 Input split bytes=14800
459 Combine input records=0
460 Combine output records=0
461 Reduce input groups=0
462 Reduce shuffle bytes=76800
463 Reduce input records=0
466 Shuffled Maps =12800
467 Failed Shuffles=0
470 CPU time spent (ms)=546840
471 Physical memory (bytes) snapshot=137153257472
251 Total time spent by all maps in occupied slots (ms)=7766444
252 Total time spent by all reduces in occupied slots (ms)=0
253 Total time spent by all map tasks (ms)=3883222
254 Total vcore-seconds taken by all map tasks=3883222
255 Total megabyte-seconds taken by all map tasks=5964628992
258 Map output records=21091153
261 Failed Shuffles=0
262 Merged Map outputs=0
263 GC time elapsed (ms)=15595
264 CPU time spent (ms)=1460320
265 Physical memory (bytes) snapshot=168560390144
266 Virtual memory (bytes) snapshot=951342964736
267 Total committed heap usage (bytes)=468841398272
268 NULL
269 age_c_0=108143
270 age_c_1=7596379
271 all=21091153
272 c_0=1258386
273 c_1=19832601
274 sex_c_0=1138055
275 sex_c_1=18447951
276 tag_c1_sum=19175427
277 tag_c_1=6952428947
278 File Input Format Counters
279 Bytes Read=244379652
280 File Output Format Counters
281 Bytes Written=205508096
282 Job2 done...
283 All Jobs Finished !
Hadoop Streaming Python 里通过标准错误输出(stderr)上报 reporter:counter 统计量
import sys, hashlib, struct, os


if __name__ == "__main__":
    # Hadoop Streaming counter reporting: writing a line of the form
    # "reporter:counter:<group>,<counter>,<amount>" to stderr increments
    # that counter. Nothing is written to stdout, so this runs as a
    # map-only job and the tallies show up only in the job log.
    for raw in sys.stdin:
        record = raw.strip()
        if 'uid_0' in record:
            sys.stderr.write("reporter:counter:group,keep_0,1\n")
        elif 'uid_1' in record:
            sys.stderr.write("reporter:counter:group,keep_1,1\n")
上例中hadoop任务不需要reducer,输出如下:(keep_0, keep_1为统计量)
425 17/08/09 09:05:25 INFO mapreduce.Job: map 100% reduce 100%
426 17/08/09 09:05:38 INFO mapreduce.Job: Job job_1493284708000_147282 completed successfully
427 17/08/09 09:05:38 INFO mapreduce.Job: Counters: 53
428 File System Counters
429 FILE: Number of bytes read=768
430 FILE: Number of bytes written=28933956
431 FILE: Number of read operations=0
432 FILE: Number of large read operations=0
433 FILE: Number of write operations=0
434 HDFS: Number of bytes read=374470086
435 HDFS: Number of bytes written=0
436 HDFS: Number of read operations=684
437 HDFS: Number of large read operations=0
438 HDFS: Number of write operations=256
439 Job Counters
440 Killed map tasks=2
441 Launched map tasks=102
442 Launched reduce tasks=128
443 Data-local map tasks=14
444 Rack-local map tasks=88
445 Total time spent by all maps in occupied slots (ms)=3509175
446 Total time spent by all reduces in occupied slots (ms)=9734817
447 Total time spent by all map tasks (ms)=701835
448 Total time spent by all reduce tasks (ms)=3244939
449 Total vcore-seconds taken by all map tasks=701835
450 Total vcore-seconds taken by all reduce tasks=3244939
451 Total megabyte-seconds taken by all map tasks=3593395200
452 Total megabyte-seconds taken by all reduce tasks=9968452608
453 Map-Reduce Framework
454 Map input records=13186141
455 Map output records=0
456 Map output bytes=0
457 Map output materialized bytes=76800
458 Input split bytes=14800
459 Combine input records=0
460 Combine output records=0
461 Reduce input groups=0
462 Reduce shuffle bytes=76800
463 Reduce input records=0
466 Shuffled Maps =12800
467 Failed Shuffles=0
470 CPU time spent (ms)=546840
471 Physical memory (bytes) snapshot=137153257472
472 Virtual memory (bytes) snapshot=656820338688
473 Total committed heap usage (bytes)=335173124096
474 Shuffle Errors
475 BAD_ID=0
476 CONNECTION=0
477 IO_ERROR=0
478 WRONG_LENGTH=0
479 WRONG_MAP=0
480 WRONG_REDUCE=0
481 group
482 keep_0=149982
483 keep_1=13036159
484 File Input Format Counters
485 Bytes Read=374455286
486 File Output Format Counters
487 Bytes Written=0