Following the official wordcount example, count the total sales for each product.
Data:
Product ID  Sales
131B 64
3CB2 61
BC1A 41
CCC2 59
ACC2 92
131B 6
3CB2 32
3CB2 36
BC1A 48
ACC2 40
Sum the sales of all records that share the same product ID.
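For the sample data above, the expected result (product ID followed by its sales total) is:
131B 70
3CB2 129
ACC2 132
BC1A 89
CCC2 59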
The program is as follows:
1. Mapper:
package com.sun.hadoop;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Emits a (product ID, sales) pair for every non-empty input line.
 *
 * @author sunjun
 * @create 2010-7-1 10:26:17 PM
 */
public class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        if (!(line == null || "".equals(line))) {
            // Each line has the form "productId<TAB>sales".
            String[] array = line.split("\\t");
            if (array != null && array.length == 2) {
                context.write(new Text(array[0].trim()),
                        new IntWritable(Integer.parseInt(array[1].trim())));
            }
        }
    }
}
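For example, the input line 131B<TAB>64 is split on the tab and emitted as the pair (131B, 64); the byte-offset key supplied by the framework is ignored.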
2. Reducer:
package com.sun.hadoop;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Sums all sales values received for the same product ID.
 *
 * @author sunjun
 * @create 2010-7-3 01:02:53 PM
 */
public class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
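Between the map and reduce phases the framework sorts and groups the intermediate pairs by key, so for the sample data the reducer receives, for example, the key 3CB2 with the values 61, 32 and 36, and writes 3CB2 129.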
3. Generating the test data:
/*
 * Generates test data under C:\cygwin\home\Administrator\hadoop-0.20.2\test-data,
 * producing five .txt files.
 */
package com.sun.hadoop;

import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.Random;

/**
 * @author sunjun
 * @create 2010-7-3 01:49:15 PM
 */
public class GenerateDate {

    private static char[] chars = "ABC123".toCharArray();

    public static void main(String[] args) {
        for (int i = 1; i < 6; i++) {
            generate(i + ".txt");
        }
        System.out.println("over.");
    }

    /**
     * Writes 10,000 lines of "productId<TAB>sales" to the given file: the product ID
     * is four random characters drawn from "ABC123" and the sales figure is a random
     * integer between 0 and 99.
     *
     * @param fileName name of the file to create under the test-data directory
     */
    private static void generate(String fileName) {
        StringBuilder str = new StringBuilder();
        int len = chars.length;
        int count = 10000;
        Random random = new Random();
        for (int i = 0; i < count; i++) {
            for (int j = 0; j < 4; j++) {
                str.append(chars[random.nextInt(len)]);
            }
            str.append("\t").append(random.nextInt(100));
            if (i < count - 1) {
                str.append("\n");
            }
        }
        try {
            OutputStream output = new FileOutputStream(new File(
                    "C:\\cygwin\\home\\Administrator\\hadoop-0.20.2\\test-data\\"
                            + fileName));
            output.write(str.toString().getBytes());
            output.flush();
            output.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
4. Driver code:
/*
 * The driver: it must be given two arguments, the input directory and the output directory.
 */
package com.sun.hadoop;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * @author sunjun
 * @create 2010-7-3 01:08:07 PM
 */
public class TestMy {

    public static void main(String[] args) {
        Configuration config = new Configuration();
        String[] otherArgs = new GenericOptionsParser(config, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: TestMy <in> <out>");
            System.exit(2);
        }
        try {
            Job job = new Job(config, "sale total");
            job.setJarByClass(TestMy.class);
            job.setMapperClass(MyMapper.class);
            job.setReducerClass(MyReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            // Use the remaining arguments (after any generic options) as the input and output paths.
            FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
            FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }
}
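Because the reduce step is a plain sum (associative and commutative), the reducer could also be reused as a combiner, as the official wordcount example does, to cut down the data shuffled from the mappers. This is an optional addition that is not part of the code above:
job.setCombinerClass(MyReducer.class);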
Running on Hadoop:
1. Running on Hadoop in standalone (local) mode
First make sure the configuration files are set up for standalone mode, not pseudo-distributed mode.
Copy the compiled classes into C:\cygwin\home\Administrator\hadoop-0.20.2.
Run the job; the test-data and test-out directories are both under C:\cygwin\home\Administrator\hadoop-0.20.2.
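Assuming the compiled classes are on Hadoop's classpath (for example via HADOOP_CLASSPATH), a typical invocation from the hadoop-0.20.2 directory would be:
bin/hadoop com.sun.hadoop.TestMy test-data test-out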
2. Running on Hadoop in pseudo-distributed mode
First make sure the configuration files are set up for pseudo-distributed mode, not standalone mode.
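In pseudo-distributed mode the input first has to be copied into HDFS, and the output is read back from HDFS after the job completes. Assuming the daemons are already running and the same class invocation as above, the steps would be roughly:
bin/hadoop fs -put test-data test-data
bin/hadoop com.sun.hadoop.TestMy test-data test-out
bin/hadoop fs -cat test-out/part-*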
Run result: