I am not sure what went wrong, but the second half of the article I just finished disappeared, perhaps because it had too many words or too many images. So let me continue with a second post.
Figure 6.8 shows the program's execution result as viewed from the web page.

As Figure 6.8 shows, the file displayed is the HDFS file /user/hadoop/KDDCUP_OUTPUT/part-00000. Its contents appear at the bottom of Figure 6.8, and every record in the file is a hit that the WordSearch program found while scanning the 5 million input records. The output uses exactly the format described earlier: <[filename]::[offset], [searchWord]::[line]>.
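Besides the web page, the result file can also be read programmatically. The snippet below is a minimal sketch, not part of the original program, that opens the same part-00000 file through the Hadoop FileSystem API and prints it line by line; the class name CatResult is just an illustration:

import java.io.BufferedReader;
import java.io.InputStreamReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class CatResult {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf); // connect to the default file system (HDFS)
        Path result = new Path("/user/hadoop/KDDCUP_OUTPUT/part-00000");
        BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(result)));
        String line;
        while ((line = in.readLine()) != null) {
            System.out.println(line); // each line is one <key, value> record from the job
        }
        in.close();
    }
}

The full source of WordSearch V2.0 is listed below.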
/***************************************************
* WordSearch V2.0
* Uses the old Hadoop API
* by think_cxf 2011-04
* Input: <offset, line>
* Output: <[filename]::[offset], [searchWord]::[line]>
***************************************************/
import java.io.*;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
public class WordSearch extends Configured implements Tool
{
/********************************
* Map:
* Input: <offset,line>
* Output: <[filename]::[offset], [searchWord]::[line]>
*********************************/
public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text>
{
private Text mapKey = new Text(); //output in map function [key]
private Text mapValue = new Text(); //output in map function [value]
private String pattern = "[^\\w]";
// regular expression matching any character that is not 0-9, a-z, A-Z, or underscore
private String sWord;
// the word to search for, passed to this task through the job configuration
private String temp;
private String FileName; // name of the input file being processed
private JobConf conf;
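// configure() is called once per task before any map() calls; cache the JobConf
// here so that map() can read per-job settings such as the search word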
public void configure(JobConf conf) {
this.conf = conf;
}
public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException
{
String line = value.toString().toLowerCase(); //WORD --> word
line = line.replaceAll(pattern, " "); // replace every character that is not 0-9, a-z, A-Z with a space
FileName = conf.get("map.input.file"); // get the name of the current input file
sWord = conf.get("searchWord"); // get the search word from the job configuration
//int Offset = (int)key.get();
//IntWritable lineOffset = new IntWritable(Offset); // Offset
System.out.println("****sWord is:" + sWord + ".****The Line is:" + line + '.');
System.out.println("FileName is:" + FileName + '.' + "\nThe Line is:" + line + '.');
StringTokenizer itr= new StringTokenizer(line);
while (itr.hasMoreTokens())
{
temp=itr.nextToken();
System.out.println("Temp word is:"+temp+'.');
if (temp.compareTo(sWord) == 0)
// compare the token with the search word; on a match, emit a key-value pair
{
System.out.println("----Find one!----");
mapKey.set('['+ FileName.toString() + "]::["+ key.toString() + "]:" );
// become [filename]::[Offset]:
mapValue.set('['+sWord.toString() + "] [" + line.toString() + "]");
// become [searchWord] [line]
output.collect(mapKey, mapValue); // emit the <[filename]::[offset], [searchWord] [line]> pair
}//if end
}//while end
}//public map end
}// class Map end
/************************************
* Reduce Function (an identity pass-through)
* Input: <[filename]::[offset], [[searchWord] [line]1, [searchWord] [line]2,
* [searchWord] [line]3, ..., [searchWord] [line]n]>
* Output: <[filename]::[offset], [searchWord] [line]>
**************************************/
public static class Reduce extends MapReduceBase implements Reducer<Text, Text, Text, Text>
{
public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException
{
while (values.hasNext()) {
output.collect(key,values.next());
} //while end
} //public reduce end
} //public class Reduce end
public int run(String[] args) throws Exception
{
JobConf conf = new JobConf(getConf(), WordSearch.class);
conf.setJobName("wordsearch"); // set the job name
conf.set("searchWord", args[2]); // pass the search word to every task via the job configuration
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setMapperClass(Map.class);
conf.setCombinerClass(Reduce.class);
conf.setReducerClass(Reduce.class);
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(args[0]) );
FileOutputFormat.setOutputPath(conf, new Path(args[1]) );
//setSearchWord(args[2]);
JobClient.runJob(conf);
return 0;
}
public static void main(String[] args) throws Exception
{
int exitCode = ToolRunner.run(new Configuration(), new WordSearch(), args);
System.exit(exitCode);
}
}
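To run the job, a command of the form hadoop jar wordsearch.jar WordSearch <inputDir> <outputDir> <searchWord> should work; the jar name wordsearch.jar is just an assumption, while the three arguments correspond to args[0] (input path), args[1] (output path), and args[2] (search word) in run() above. Note that with the old API, FileOutputFormat refuses to run if the output directory already exists, so delete KDDCUP_OUTPUT from HDFS before re-running the job.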