Experiment Content and Requirements
1. In the Eclipse environment, write a WordCount program that counts every word occurring more than k times, excluding stop words (such as a, an, of, in, on, the, this, that, ...), and outputs the final results sorted by word frequency from high to low;
2. Run the program on the cluster to process the document data of the collected works of Shakespeare;
3. You may create your own stop-word list file; it only needs to contain some of the stop words, not all of them. The parameter k is specified dynamically as an input argument (e.g. k=10). A sample stop-word file is sketched below.
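A minimal patterns.txt (the stop-word file referenced in the run commands below) might contain one entry per line, for example as shown here. The mapper applies each entry as a Java regular expression via replaceAll, so wrapping each word in \b word boundaries keeps it from being stripped out of the middle of longer words (a bare "the", for instance, would also mangle "theater"):

\ba\b
\ban\b
\bof\b
\bin\b
\bon\b
\bthe\b
\bthis\b
\bthat\b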
Code:
import java.io.*;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.mapred.lib.InverseMapper;
import org.apache.hadoop.util.*;
import org.apache.hadoop.fs.FileSystem;

public class WordCount extends Configured implements Tool {

    // Mapper class
    public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
        static enum Counters { INPUT_WORDS }

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();
        private Set<String> patternsToSkip = new HashSet<String>();
        private long numRecords = 0;
        private String inputFile;
        private String otherPattern = "[^\\w]"; // regex matching every character that is not 0-9, a-z, A-Z or underscore

        /**
         * Override configure(): load the stop-word pattern files from the distributed cache.
         */
        public void configure(JobConf job) {
            inputFile = job.get("map.input.file");
            if (job.getBoolean("wordcount.skip.patterns", false)) {
                Path[] patternsFiles = new Path[0];
                try {
                    patternsFiles = DistributedCache.getLocalCacheFiles(job);
                } catch (IOException ioe) {
                    System.err.println("Caught exception while getting cached files: " + StringUtils.stringifyException(ioe));
                }
                for (Path patternsFile : patternsFiles) {
                    parseSkipFile(patternsFile);
                }
            }
        }

        /**
         * Read a stop-word file and add each line (one pattern per line) to the skip list.
         */
        private void parseSkipFile(Path patternsFile) {
            try {
                BufferedReader fis = new BufferedReader(new FileReader(patternsFile.toString()));
                String pattern = null;
                while ((pattern = fis.readLine()) != null) {
                    patternsToSkip.add(pattern);
                }
            } catch (IOException ioe) {
                System.err.println("Caught exception while parsing the cached file '" + patternsFile + "' : " + StringUtils.stringifyException(ioe));
            }
        }

        public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            String line = value.toString();
            line = line.replaceAll(otherPattern, " "); // replace every character that is not 0-9, a-z, A-Z or underscore with a space
            for (String pattern : patternsToSkip) {
                line = line.replaceAll(pattern, ""); // each skip entry is applied as a regular expression
            }
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                output.collect(word, one);
                reporter.incrCounter(Counters.INPUT_WORDS, 1);
            }
            if ((++numRecords % 100) == 0) {
                reporter.setStatus("Finished processing " + numRecords + " records " + "from the input file: " + inputFile);
            }
        }
    }

    // Reducer class
    public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
        private int k;

        /**
         * Override configure(): read the frequency threshold k from the job configuration.
         */
        public void configure(JobConf job) {
            k = job.getInt("k", 0);
        }

        public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            if (sum > k)
                output.collect(key, new IntWritable(sum));
        }
    }

    // Comparator that sorts IntWritable keys in descending order
    private static class IntWritableDecreasingComparator extends IntWritable.Comparator {
        public int compare(WritableComparable a, WritableComparable b) {
            return -super.compare(a, b);
        }
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            return -super.compare(b1, s1, l1, b2, s2, l2);
        }
    }

    // run() method
    public int run(String[] args) throws Exception {
        // First job: count word frequencies over the corpus, filter by the threshold k and remove stop words
        JobConf conf = new JobConf(getConf(), WordCount.class);
        conf.setJobName("word count");
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);
        conf.setMapperClass(Map.class);
        conf.setReducerClass(Reduce.class);
        conf.setInputFormat(TextInputFormat.class);

        List<String> other_args = new ArrayList<String>();
        for (int i = 0; i < args.length; ++i) {
            if ("-skip".equals(args[i])) {
                DistributedCache.addCacheFile(new Path(args[++i]).toUri(), conf);
                conf.setBoolean("wordcount.skip.patterns", true);
            } else {
                other_args.add(args[i]);
            }
        }
        FileInputFormat.setInputPaths(conf, new Path(other_args.get(0)));

        Path tempDir = new Path("WordCount_temp" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); // temporary directory
        FileOutputFormat.setOutputPath(conf, tempDir); // the counting job writes to the temporary directory, which the sorting job then uses as its input directory
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        JobClient.runJob(conf);

        // Second job: sort the words by frequency from high to low
        JobConf sortJob = new JobConf(getConf(), WordCount.class);
        sortJob.setJobName("sort");
        sortJob.setJarByClass(WordCount.class);
        FileInputFormat.setInputPaths(sortJob, tempDir);
        sortJob.setInputFormat(SequenceFileInputFormat.class);
        /* InverseMapper is provided by the Hadoop library; it swaps the key and value of each input pair. */
        sortJob.setMapperClass(InverseMapper.class);
        /* Limit the number of reducers to 1 so that a single result file is produced. */
        sortJob.setNumReduceTasks(1);
        FileOutputFormat.setOutputPath(sortJob, new Path(other_args.get(1)));
        sortJob.setOutputFormat(TextOutputFormat.class);
        sortJob.setOutputKeyClass(IntWritable.class);
        sortJob.setOutputValueClass(Text.class);
        /* Hadoop sorts IntWritable keys in ascending order by default, but we need descending order.
         * IntWritableDecreasingComparator implements the reversed comparison and is set here as the
         * comparator used to sort the output keys (the word frequencies). */
        sortJob.setOutputKeyComparatorClass(IntWritableDecreasingComparator.class);
        JobClient.runJob(sortJob);

        FileSystem.get(conf).delete(tempDir, true); // delete the temporary directory
        return 0;
    }

    // main() method
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new WordCount(), args);
        System.exit(res);
    }
}
Export the project from Eclipse as a jar named WordCount.jar (specifying WordCount as the main class, so that hadoop jar can run it without an explicit class name).
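If you would rather build from the command line instead of Eclipse, a rough sketch is the following (the path to the Hadoop core jar is an assumption; adjust it to your installation). A jar built this way has no Main-Class entry in its manifest, so the class name WordCount must then be given right after WordCount.jar in the hadoop jar command below:

mkdir wordcount_classes
javac -classpath /path/to/hadoop-core.jar -d wordcount_classes WordCount.java
jar cvf WordCount.jar -C wordcount_classes .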
Run:
hadoop namenode -format
start-all.sh
jps
hadoop dfs -mkdir test-input
hadoop dfs -copyFromLocal XXX_folder test-input    (XXX_folder is the local folder holding the corpus)
hadoop dfs -mkdir tempDir
hadoop dfs -copyFromLocal patterns.txt tempDir    (patterns.txt is the stop-word file)
hadoop jar WordCount.jar -D k=10 test-input/XXX_folder test-output -skip tempDir/patterns.txt    (k is the frequency threshold and can be adjusted)
hadoop dfs -get test-output/part-00000 output.txt
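Because the sort job writes with TextOutputFormat, using the frequency as the key and the word as the value, each line of test-output/part-00000 is a tab-separated frequency/word pair, ordered from the most frequent word downwards. To peek at the top entries directly on HDFS (assuming the output path used above):

hadoop dfs -cat test-output/part-00000 | head -n 20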
References:
http://blog.chinaunix.net/space.php?uid=20761674&do=blog&cuid=2157576
http://blog.csdn.net/xw13106209/archive/2011/01/07/6122719.aspx
This post is the author's own notes, put online for everyone's reference. Please do not copy it verbatim. Thanks!