Following the official wordcount example, count the total sales for each product.
Data:
Product ID  Sales
131B 64
3CB2 61
BC1A 41
CCC2 59
ACC2 92
131B 6
3CB2 32
3CB2 36
BC1A 48
ACC2 40
Sum the sales of all records that share the same product ID.
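For the sample data above, the expected result (product ID followed by its sales total) is:
131B 70
3CB2 129
ACC2 132
BC1A 89
CCC2 59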
The program is as follows:
1. Mapper:
package com.sun.hadoop;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Emits a (product ID, sales) pair for every non-empty input line.
 *
 * @author sunjun
 * @create 2010-7-1 10:26:17 PM
 */
public class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        if (!(line == null || "".equals(line))) {
            // Each line has the form "productId<TAB>sales".
            String[] array = line.split("\\t");
            if (array != null && array.length == 2) {
                context.write(new Text(array[0].trim()),
                        new IntWritable(Integer.parseInt(array[1].trim())));
            }
        }
    }
}
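For example, the input line 131B<TAB>64 is split on the tab and emitted as the pair (131B, 64); the byte-offset key supplied by the framework is ignored.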
2. Reducer:
package com.sun.hadoop;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Sums all sales values received for the same product ID.
 *
 * @author sunjun
 * @create 2010-7-3 01:02:53 PM
 */
public class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
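Between the map and reduce phases the framework sorts and groups the intermediate pairs by key, so for the sample data the reducer receives, for example, the key 3CB2 with the values 61, 32 and 36, and writes 3CB2 129.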
3. Generating the test data:
/*
 * Generates test data under C:\cygwin\home\Administrator\hadoop-0.20.2\test-data,
 * producing five .txt files.
 */
package com.sun.hadoop;

import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.Random;

/**
 * @author sunjun
 * @create 2010-7-3 01:49:15 PM
 */
public class GenerateDate {

    private static char[] chars = "ABC123".toCharArray();

    public static void main(String[] args) {
        for (int i = 1; i < 6; i++) {
            generate(i + ".txt");
        }
        System.out.println("over.");
    }

    /**
     * Writes 10,000 lines of "productId<TAB>sales" to the given file: the product ID
     * is four random characters drawn from "ABC123" and the sales figure is a random
     * integer between 0 and 99.
     *
     * @param fileName name of the file to create under the test-data directory
     */
    private static void generate(String fileName) {
        StringBuilder str = new StringBuilder();
        int len = chars.length;
        int count = 10000;
        Random random = new Random();
        for (int i = 0; i < count; i++) {
            for (int j = 0; j < 4; j++) {
                str.append(chars[random.nextInt(len)]);
            }
            str.append("\t").append(random.nextInt(100));
            if (i < count - 1) {
                str.append("\n");
            }
        }
        try {
            OutputStream output = new FileOutputStream(new File(
                    "C:\\cygwin\\home\\Administrator\\hadoop-0.20.2\\test-data\\"
                            + fileName));
            output.write(str.toString().getBytes());
            output.flush();
            output.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
4. Driver code:
/*
 * The driver: it must be given two arguments, the input directory and the output directory.
 */
package com.sun.hadoop;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * @author sunjun
 * @create 2010-7-3 01:08:07 PM
 */
public class TestMy {

    public static void main(String[] args) {
        Configuration config = new Configuration();
        String[] otherArgs = new GenericOptionsParser(config, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: TestMy <in> <out>");
            System.exit(2);
        }
        try {
            Job job = new Job(config, "sale total");
            job.setJarByClass(TestMy.class);
            job.setMapperClass(MyMapper.class);
            job.setReducerClass(MyReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            // Use the remaining arguments (after any generic options) as the input and output paths.
            FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
            FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }
}
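Because the reduce step is a plain sum (associative and commutative), the reducer could also be reused as a combiner, as the official wordcount example does, to cut down the data shuffled from the mappers. This is an optional addition that is not part of the code above:
job.setCombinerClass(MyReducer.class);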
Running on Hadoop:
1. Running on Hadoop in standalone (local) mode
First make sure the configuration files are set up for standalone mode, not pseudo-distributed mode.
Copy the compiled classes into C:\cygwin\home\Administrator\hadoop-0.20.2.
Run the job; the test-data and test-out directories are both under C:\cygwin\home\Administrator\hadoop-0.20.2.
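Assuming the compiled classes are on Hadoop's classpath (for example via HADOOP_CLASSPATH), a typical invocation from the hadoop-0.20.2 directory would be:
bin/hadoop com.sun.hadoop.TestMy test-data test-out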
2. Running on Hadoop in pseudo-distributed mode
First make sure the configuration files are set up for pseudo-distributed mode, not standalone mode.
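In pseudo-distributed mode the input first has to be copied into HDFS, and the output is read back from HDFS after the job completes. Assuming the daemons are already running and the same class invocation as above, the steps would be roughly:
bin/hadoop fs -put test-data test-data
bin/hadoop com.sun.hadoop.TestMy test-data test-out
bin/hadoop fs -cat test-out/part-*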
Run result: