About WordCount
I recommend this article, which explains everything very clearly -> link
The idea (if anything below is unclear, read the linked article)
1. The original code reads the input line by line, merges identical words, and outputs each word's count in order. Since we don't know which word is the last one, we can append a marker to every line to signal the end of the input; the program can then finish once it reads the marker.
For example, we can use "完" ("done") as the marker and set its value to -1 (so a negative sum signals the end):
while (token.hasMoreTokens()) {
word.set(token.nextToken());
context.write(word, one);
}
Text w = new Text("完");
int last = -1;
context.write(w, new IntWritable(last));
After this change there is one extra key, "完", and its value is negative.
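For example (an illustration of my own, not output from a real run), the input line "hello hello world" now makes the mapper emit:

(hello, 1), (hello, 1), (world, 1), (完, -1)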
2. Then we can add a global (static) variable to keep the running total. Note that this only works because the whole job runs with a single reducer in one JVM; with several reducers, each JVM would have its own copy of the static field.
public class wordcount {
public static int sum2=0;
3. Since we are counting the total number of distinct words, a word that appears more than once still counts as one; we keep counting until we reach the negative marker, then write out the totals. Because "完" is emitted once per line with value -1, accumulating its values also tells us how many lines there were:
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
if (val.get() < 0) {
// marker key "完": sum now holds -(number of lines)
Text z = new Text("总数:");
context.write(new Text("行数:"), new IntWritable(-sum));
context.write(z, new IntWritable(sum2));
}
sum2++;
}
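One detail the trick quietly depends on: MapReduce sorts keys before the reduce phase, and Text compares by raw UTF-8 bytes, so the CJK marker "完" sorts after every ASCII word and its reduce() call runs last. A minimal standalone check (my own sketch, not from the linked article):

import org.apache.hadoop.io.Text;

public class SentinelOrder {
public static void main(String[] args) {
// Text compares raw UTF-8 bytes: "完" encodes as 0xE5 0xAE 0x8C, above any
// ASCII byte, so the marker sorts after every ASCII word.
Text lastAsciiWord = new Text("zzz");
Text sentinel = new Text("完");
System.out.println(lastAsciiWord.compareTo(sentinel) < 0); // prints true
}
}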
Full code
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;
import org.apache.hadoop.util.GenericOptionsParser;
public class wordcount {
public static int sum2=0;
// Custom mapper, extending org.apache.hadoop.mapreduce.Mapper
public static class WordCountMap extends Mapper<LongWritable, Text, Text, IntWritable> {
private final IntWritable one = new IntWritable(1);
private Text word = new Text();
// Mapper<LongWritable, Text, Text, IntWritable>.Context context
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
System.out.println(line);
// StringTokenizer splits the line on whitespace (spaces, tabs, newlines) by default.
// An earlier version used String.split("\t"), which splits only on tab characters;
// my text file was space-separated, which is why the results were so far off back then.
StringTokenizer token = new StringTokenizer(line);
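// e.g. "foo bar\tbaz" (space and tab mixed) yields the tokens "foo", "bar", "baz"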
while (token.hasMoreTokens()) {
word.set(token.nextToken());
context.write(word, one);
}
// emit the marker once per line with value -1
Text w = new Text("完");
int last = -1;
context.write(w, new IntWritable(last));
}
}
// Custom reducer, extending org.apache.hadoop.mapreduce.Reducer
public static class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
// Reducer<Text, IntWritable, Text, IntWritable>.Context context
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
System.out.println(key);
System.out.println(values);
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
if (val.get() < 0) {
// marker key "完": sum now holds -(number of lines), sum2 the distinct-word count
Text z = new Text("总数:");
context.write(new Text("行数:"), new IntWritable(-sum));
context.write(z, new IntWritable(sum2));
}
sum2++;
}
//context.write(key, new IntWritable(sum));
}
}
// Client-side code; once written, it is handed to the ResourceManager framework to execute
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "word count");
// Run as a packaged jar
job.setJarByClass(wordcount.class);
// Where is the input data?
FileInputFormat.addInputPath(job, new Path(args[0]));
// Which mapper handles the input?
job.setMapperClass(WordCountMap.class);
// What are the map output types? (Left commented out: they default to the reduce output types set below.)
//job.setMapOutputKeyClass(Text.class);
//job.setMapOutputValueClass(IntWritable.class);
job.setCombinerClass(IntSumReducer.class);
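// Note: the combiner (when Hadoop chooses to run it) pre-sums counts on the map side.
// With one map task the reducer then sees a single combined value per word (which is
// why sum2 ends up counting distinct words) and one negative total for the marker "完".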
// Which reducer handles the map output?
job.setReducerClass(WordCountReduce.class);
// What are the reduce output types?
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// job.setInputFormatClass(TextInputFormat.class);
// job.setOutputFormatClass(TextOutputFormat.class);
// Where does the output go?
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// Hand the job to YARN and wait until it finishes before exiting
job.waitForCompletion(true);
/*
String[] otherArgs = new GenericOptionsParser(conf,args).getRemainingArgs();
if(otherArgs.length<2){
System.out.println("Usage:wordcount <in> [<in>...] <out>");
System.exit(2);
}
for(int i=0;i<otherArgs.length-1;i++){
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
System.exit(job.waitForCompletion(true)?0:1);
*/
}
}
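A quick sanity check (my own example, assuming a single map task, the default single reducer, and that the combiner actually runs): for a two-line input file

hello world
hello hadoop

the reducer receives the keys in sorted order (hadoop, hello, world, 完), sum2 reaches 3 before the marker's reduce() call, and the job writes:

行数: 2
总数: 3

To run it, package the class into a jar and submit it along the lines of hadoop jar wordcount.jar wordcount <input> <output> (the jar name here is a placeholder).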
That's my approach; if you have a different idea, feel free to leave a comment.