I am not sure what went wrong, but the second half of the article I just finished disappeared, perhaps because it had too many words or too many images. So let me continue with a second post.
Figure 6.8 shows the program's execution result as viewed from the web page.

As Figure 6.8 shows, the file displayed is the HDFS file /user/hadoop/KDDCUP_OUTPUT/part-00000. Its contents appear at the bottom of Figure 6.8, and every record in the file is a hit that the WordSearch program found while scanning the 5 million input records. The output uses exactly the format described earlier: <[filename]::[offset], [searchWord]::[line]>.
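Besides the web page, the result file can also be read programmatically. The snippet below is a minimal sketch, not part of the original program, that opens the same part-00000 file through the Hadoop FileSystem API and prints it line by line; the class name CatResult is just an illustration:

import java.io.BufferedReader;
import java.io.InputStreamReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class CatResult {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf); // connect to the default file system (HDFS)
        Path result = new Path("/user/hadoop/KDDCUP_OUTPUT/part-00000");
        BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(result)));
        String line;
        while ((line = in.readLine()) != null) {
            System.out.println(line); // each line is one <key, value> record from the job
        }
        in.close();
    }
}

The full source of WordSearch V2.0 is listed below.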
/***************************************************
* WordSearch V2.0
* Uses the old Hadoop API
* by think_cxf 2011-04
* Input: <offset, line>
* Output: <[filename]::[offset], [searchWord]::[line]>
***************************************************/
import java.io.*;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
public class WordSearch extends Configured implements Tool
{
/********************************
* Map:
* Input: <offset,line>
* Output: <[filename]::[offset], [searchWord]::[line]>
*********************************/
public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text>
{
private Text mapKey = new Text(); //output in map function [key]
private Text mapValue = new Text(); //output in map function [value]
private String pattern = "[^\\w]";
// regular expression matching any character that is not 0-9, a-z, A-Z, or underscore
private String sWord;
// the word to search for, passed to this task through the job configuration
private String temp;
private String FileName; // name of the input file being processed
private JobConf conf;
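// configure() is called once per task before any map() calls; cache the JobConf
// here so that map() can read per-job settings such as the search word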
public void configure(JobConf conf) {
this.conf = conf;
}
public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException
{
String line = value.toString().toLowerCase(); //WORD --> word
line = line.replaceAll(pattern, " "); // replace every character that is not 0-9, a-z, A-Z with a space
FileName = conf.get("map.input.file"); // get the name of the current input file
sWord = conf.get("searchWord"); // get the search word from the job configuration
//int Offset = (int)key.get();
//IntWritable lineOffset = new IntWritable(Offset); // Offset
System.out.println("****sWord is:" + sWord + ".****The Line is:" + line + '.');
System.out.println("FileName is:" + FileName + '.' + "\nThe Line is:" + line + '.');
StringTokenizer itr= new StringTokenizer(line);
while (itr.hasMoreTokens())
{
temp=itr.nextToken();
System.out.println("Temp word is:"+temp+'.');
if (temp.compareTo(sWord) == 0)
// compare the token with the search word; on a match, emit a key-value pair
{
System.out.println("----Find one!----");
mapKey.set('['+ FileName.toString() + "]::["+ key.toString() + "]:" );
// become [filename]::[Offset]:
mapValue.set('['+sWord.toString() + "] [" + line.toString() + "]");
// become [searchWord] [line]
output.collect(mapKey, mapValue); // emit the <[filename]::[offset], [searchWord] [line]> pair
}//if end
}//while end
}//public map end
}// class Map end
/************************************
* Reduce Function (an identity pass-through)
* Input: <[filename]::[offset], [[searchWord] [line]1, [searchWord] [line]2,
* [searchWord] [line]3, ..., [searchWord] [line]n]>
* Output: <[filename]::[offset], [searchWord] [line]>
**************************************/
public static class Reduce extends MapReduceBase implements Reducer<Text, Text, Text, Text>
{
public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException
{
while (values.hasNext()) {
output.collect(key,values.next());
} //while end
} //public reduce end
} //public class Reduce end
public int run(String[] args) throws Exception
{
JobConf conf = new JobConf(getConf(), WordSearch.class);
conf.setJobName("wordsearch"); // set the job name
conf.set("searchWord", args[2]); // pass the search word to every task via the job configuration
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setMapperClass(Map.class);
conf.setCombinerClass(Reduce.class);
conf.setReducerClass(Reduce.class);
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(args[0]) );
FileOutputFormat.setOutputPath(conf, new Path(args[1]) );
//setSearchWord(args[2]);
JobClient.runJob(conf);
return 0;
}
public static void main(String[] args) throws Exception
{
int exitCode = ToolRunner.run(new Configuration(), new WordSearch(), args);
System.exit(exitCode);
}
}
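To run the job, a command of the form hadoop jar wordsearch.jar WordSearch <inputDir> <outputDir> <searchWord> should work; the jar name wordsearch.jar is just an assumption, while the three arguments correspond to args[0] (input path), args[1] (output path), and args[2] (search word) in run() above. Note that with the old API, FileOutputFormat refuses to run if the output directory already exists, so delete KDDCUP_OUTPUT from HDFS before re-running the job.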