1. Streaming
Map task:
#!/bin/bash
# Mapper: split each line on spaces, commas, and periods, count words
# locally, and emit "word<TAB>count" pairs.
awk 'BEGIN {
    FS = "[ ,.]+"
    OFS = "\t"
}
{
    for (i = 1; i <= NF; ++i)
        if ($i != "")    # a leading separator can still yield an empty field
            dict[$i] += 1
}
END {
    for (key in dict)
        print key, dict[key]
}'
Reducer task:
#!/bin/bash
# Reducer: sum the partial counts for each word and emit the totals.
awk 'BEGIN {
    OFS = "\t"
}
{
    dict[$1] += $2
}
END {
    for (key in dict)
        print key, dict[key]
}'
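Before submitting anything to the cluster, the pair can be sanity-checked locally by simulating the framework's shuffle with sort (input.txt is a stand-in for any sample text file):

cat input.txt | sh mapper.sh | sort | sh reducer.sh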
Launch script:
#!/bin/bash
# Clear the previous output directory; the job fails if it already exists.
hadoop fs -rm -r /data/apps/zhangwenchao/mapreduce/streaming/wordcount/output

hadoop jar /data/tools/hadoop/hadoop-2.6.2/share/hadoop/tools/lib/hadoop-streaming-2.6.2.jar \
    -input /data/apps/zhangwenchao/mapreduce/streaming/wordcount/input \
    -output /data/apps/zhangwenchao/mapreduce/streaming/wordcount/output \
    -mapper "sh -x mapper.sh" \
    -reducer "sh -x reducer.sh" \
    -file mapper.sh \
    -file reducer.sh \
    -jobconf mapred.job.name=wordcount \
    -jobconf mapred.map.tasks=5 \
    -jobconf mapred.reduce.tasks=3
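After the job finishes, the result can be read directly from HDFS; with the three reducers configured above, the output is split across three part files:

hadoop fs -cat /data/apps/zhangwenchao/mapreduce/streaming/wordcount/output/part-*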
2. Python
Map task:
#!/usr/bin/python
import sys
import re

# Mapper: split each line on punctuation, then on whitespace,
# and emit "word<TAB>1" for every token.
for line in sys.stdin:
    wordlist = re.split('[;,.?]', line)
    for words in wordlist:
        words = words.strip()
        for item in words.split():
            print "%s\t%s" % (item, 1)
Reducer task:
#!/usr/bin/env python
import sys

# Reducer: streaming delivers input sorted by key, so counts for the
# same word arrive on adjacent lines and a single running total suffices.
current_word = None
current_count = 0
word = None

for line in sys.stdin:
    line = line.strip()
    word, count = line.split('\t', 1)
    try:
        count = int(count)
    except ValueError:
        continue
    if current_word == word:
        current_count += count
    else:
        if current_word:
            print '%s\t%s' % (current_word, current_count)
        current_count = count
        current_word = word

# Flush the final word.
if word == current_word:
    print "%s\t%s" % (current_word, current_count)
Launch script:
#!/bin/bash
# Clear the previous output directory; the job fails if it already exists.
hadoop fs -rm -r /data/apps/zhangwenchao/mapreduce/python/wordcount/output

hadoop jar /data/tools/hadoop/hadoop-2.6.2/share/hadoop/tools/lib/hadoop-streaming-2.6.2.jar \
    -input /data/apps/zhangwenchao/mapreduce/python/wordcount/input \
    -output /data/apps/zhangwenchao/mapreduce/python/wordcount/output \
    -mapper "python mapper.py" \
    -reducer "python reducer.py" \
    -file mapper.py \
    -file reducer.py \
    -jobconf mapred.job.name=wordcount \
    -jobconf mapred.map.tasks=5 \
    -jobconf mapred.reduce.tasks=3
3. Java
Map:
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MyMap extends Mapper<LongWritable, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            context.write(word, one);
        }
    }
}
Reduce:
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
Main:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Main {
    public static void main(String[] args) throws Exception {
        String input = "hdfs://test1:8020/test/**/test/zhangwenchao/java/wordcount/input";
        String output = "hdfs://test1:8020/test/**/test/zhangwenchao/java/wordcount/output";

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf); // new Job(conf) is deprecated in Hadoop 2.x
        job.setJobName("test4");
        job.setJarByClass(Main.class);

        FileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        job.setMapperClass(MyMap.class);
        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Word counting is associative and commutative, so the reducer
        // can double as a combiner.
        job.setCombinerClass(MyReduce.class);
        job.setNumReduceTasks(3);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
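Assuming the three classes live in the default package, a minimal build-and-submit sequence might look like the following (the jar name wordcount.jar is illustrative):

javac -classpath "$(hadoop classpath)" MyMap.java MyReduce.java Main.java
jar cvf wordcount.jar *.class
hadoop jar wordcount.jar Main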