倒排索引和利用PageRank算法进行网页排序

最新推荐文章于 2024-06-09 19:51:04 发布

如是Rushy

最新推荐文章于 2024-06-09 19:51:04 发布

阅读量2k

点赞数 5

分类专栏：数据科学原理文章标签： map hadoop

本文链接：https://blog.csdn.net/weixin_44686879/article/details/111571788

版权

数据科学原理专栏收录该内容

3 篇文章 9 订阅

订阅专栏

第三章倒排索引

前面通过词频统计，已经可以找出高频率的“关键词”了，这些词汇出现的频率很高以至于很难直接对其所在的文档进行查找。必须借助一定的关系模型表示单词与文本的关系，然后才可以实现快速搜索查找。

在这里插入图片描述

单词-文档矩阵是表达这种包含关系的最简洁的概念模型。每列代表文档包含了哪些单词，比如文档1包含了词汇1和词汇4，而不包含其它单词。每行代表了哪些文档包含了某个单词。比如对于词汇1来说，文档1和文档4中出现过单词1，而其它文档不包含词汇1。

搜索引擎的索引其实就是实现“单词-文档矩阵”的具体数据结构。可以有不同的方式来实现上述概念模型，比如“倒排索引”、“签名文件”、“后缀树”等方式。但是各项实验数据表明，“倒排索引”是实现单词到文档映射关系的最佳实现方式

本关任务

要求：编写处理带词频属性的文档倒排索引程序，运行程序，对莎士比亚文集文档数据进行倒排索引处理，结果输出到指定文件。

注：输入输出文件的路径已经指定，

注释

import java.io.IOException;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.util.Iterator;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.GenericOptionsParser;

public class InvertedIndex {
	public static class InvertedIndexMapper extends Mapper<LongWritable, Text, Text, Text> 
	{
		public void map(LongWritable key, Text value, Context context)  
				throws IOException, InterruptedException 
		 
		{	
			FileSplit fileSplit = (FileSplit)context.getInputSplit();
			String fileName = fileSplit.getPath().getName();
			
			String word;
			IntWritable frequence=new IntWritable();
			int one=1;
			Hashtable<String,Integer>	hashmap=new Hashtable();//key关键字设置为String
			StringTokenizer itr = new StringTokenizer(value.toString());
			
//****请用hashmap定义的方法统计每一行中相同单词的个数，key为行值是每一行对应的偏移****//
/*********begin*********/

 for(;itr.hasMoreTokens(); ) 
            {   
                word=itr.nextToken();
                if(hashmap.containsKey(word)){
                    hashmap.put(word,hashmap.get(word)+1);
                }else{
                    hashmap.put(word, one);
                }
            }




/*********end**********/			
							

			for(Iterator<String> it=hashmap.keySet().iterator();it.hasNext();){
				word=it.next();
				frequence=new IntWritable(hashmap.get(word));
				Text fileName_frequence = new Text(fileName+"@"+frequence.toString());//以<K2,“单词 文件名@出现频次”> 的格式输出
				context.write(new Text(word),fileName_frequence);
			}
			
		}
	}

	public static class InvertedIndexCombiner extends Reducer<Text,Text,Text,Text>{
		protected void reduce(Text key,Iterable<Text> values,Context context)
						throws IOException ,InterruptedException{ 
//****请合并mapper函数的输出，并提取“文件@1”中‘@’后面的词频，以<K2,list(“单词 文件名@出现频次”)>的格式输出****//
/*********begin*********/
 String fileName="";
            int sum=0;
            String num;
            String s;
            for (Text val : values) {
                    s= val.toString();
                    fileName=s.substring(0, val.find("@"));
                    num=s.substring(val.find("@")+1, val.getLength());
                    sum+=Integer.parseInt(num);
            }
        IntWritable frequence=new IntWritable(sum);
        context.write(key,new Text(fileName+"@"+frequence.toString()));





/*********end**********/				

		}
	}
	
	public static class InvertedIndexReducer extends Reducer<Text, Text, Text, Text> 
	{	@Override
		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException 
	 	{	Iterator<Text> it = values.iterator();
			StringBuilder all = new StringBuilder();
			if(it.hasNext())  all.append(it.next().toString());
			for(;it.hasNext();) {
				all.append(";");
				all.append(it.next().toString());					
			}
//****请输出最终键值对list(K3，“单词", “文件1@频次; 文件2@频次;...")****//
/*********begin*********/

            context.write(key, new Text(all.toString()));



/*********end**********/		
		}
	}

	public static void main(String[] args) 
    {
		if(args.length!=2){
			System.err.println("Usage: InvertedIndex <in> <out>");
			System.exit(2);
		}
		
      try {
		        Configuration conf = new Configuration();
		        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
		        
		        Job job = new Job(conf, "invertedindex");
				job.setJarByClass(InvertedIndex.class);
				job.setMapperClass(InvertedIndexMapper.class);
			//****请为job设置Combiner类****//
/*********begin*********/
                job.setCombinerClass(InvertedIndexCombiner.class);
/*********end**********/								
				job.setReducerClass(InvertedIndexReducer.class);
				
				job.setOutputKeyClass(Text.class);
			//****请设置输出value的类型****//
/*********begin*********/
                job.setOutputValueClass(Text.class);
/*********end**********/									
				FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
				FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
				
				System.exit(job.waitForCompletion(true) ? 0 : 1);
	 
        } catch (Exception e) { 
        	e.printStackTrace();
        }
    }
}

第四章网页排序——PageRank算法

前面我们关注了文本中词频的统计，筛选出了文本中的高频词汇，也通过倒排索引建立了关键词和文本集间的索引关系。接下来从宏观上关注文本间的关系。文本间的引用最具代表性，比如网页的相互链接。显而易见，被引用（链接）次数多的文本（网页）更加重要，在面对海量的文本时可以以此对文本重要性进行排序，尽快地找到有用信息。

本关任务

要求：编写实现网页数据集PageRank算法的程序，对网页数据集进行处理得到网页权重排序。

完整PageRank计算公式

由于存在一些出链为0不链接任何其他网页的网页，因此需要对 PageRank公式进行修正，即在简单公式的基础上增加了阻尼系数（damping factor）q， q一般取值q=0.85

在这里插入图片描述

P1，P2,…,Pn是被研究的页面，M(Pi)是链入Pi的页面集合，L(Pj)是Pj链出页面的数量，而N是所有页面的数量。PageRank值是一个特殊矩阵中的特征向量。这个特征向量为：

在这里插入图片描述

PageRank计算过程

在这里插入图片描述

幂法计算过程如下：
设 X 为任意一个初始向量，即为每个网页设置相同的初始 PageRank 值，一般取 1。令 R = AX。

   while  (1){
        if ( |X - R| < e) 
             return R;  //如果最后两次的结果近似或者相同，返回R
        else   {
                X =R;
               R = AX;
               }
    }

MapReduce计算PageRank

上面的演算过程，采用矩阵相乘，不断迭代，直到迭代前后概率分布向量的值变化不大，一般迭代到30次以上就收敛了。真实的web结构的转移矩阵非常大，目前的网页数量已经超过100亿，转移矩阵是100亿*100亿的矩阵，直接按矩阵乘法的计算方法不可行，需要借助Map-Reduce的计算方式来解决。

对于如下图所示的相互链接网页关系
在这里插入图片描述

可以利用转移矩阵进行表示。转移矩阵是一个多维的稀疏矩阵，把web图中的每一个网页及其链出的网页作为一行，这样第四节中的web图结构用如下方式表示：

1. A   B    C    D
2. B   A    D
3. C   C
4. D   B    C

可以看到A有三条出链，分别指向B、C、D，实际上爬取的网页结构数据就是这样的。
1.Map阶段
Map操作的每一行，对所有出链发射当前网页概率值的1/k，k是当前网页的出链数，比如对第一行输出<B，1/3×1/4>,<C，1/3×1/4>,<D，1/3×1/4>（其中1/4是网页A的初始概率值，1/3即1/k）;
2、Reduce阶段
Reduce操作收集网页id相同的值，累加并按权重计算，pj=a*(p1+p2+…+pm)+(1-a)*1/n，其中m是指向网页j的网页数，n是所有网页数。
思路就是这么简单，但是实践的时候，怎样在Map阶段知道当前行网页的概率值，需要一个单独的文件专门保存上一轮的概率分布值，先进行一次排序，让出链行与概率值按网页id出现在同一Mapper里面，整个流程如下：
在这里插入图片描述

这样进行一次迭代相当于需要两次MapReduce，但第一次的MapReduce只是简单的排序，不需要任何操作，用java调用Hadoop的Streaming.

编程要求

本关的编程任务是补全右侧代码片段中map和reduce函数中的代码，具体要求及说明如下：

在主函数main中已初始化hadoop的系统设置，包括hadoop运行环境的连接。
在main函数中，已经设置好了待处理文档路径（即input），在评测中设置了结果输出路径（即output），不要修改循环输出路径即可保证完成。
在main函数中，已经声明了job对象，程序运行的工作调度已经设定好。
原则上循环迭代次数越多越精准，但是为了保证平台资源，只允许运行5次迭代，多余过程被忽略无法展示，请勿增加循环次数。
本关只要求在map和reduce函数的指定区域进行代码编写，其他区域请勿改动。

测试说明
输入文件格式如下：

1    1.0 2 3 4 5 6 7 8
2    2.0 3 4 5 6 7 8
3    3.0 4 5 6 7 8
4    4.0 5 6 7 8
5    5.0 6 7 8
6    6.0 7 8
7    7.0 8
8    8.0 1 2 3 4 5 6 7

注：为了简化运算，已经对网页集关系进行了规整，并且给出了相应的初始PR值。
以第一行为例： 1表示网址（以tab键隔开），1.0为给予的初始pr值，2，3，4，5，6，7，8为从网址1指向的网址。
输出文件格式：

The origin result
1    1.0 2 3 4 5 6 7 8
2    2.0 3 4 5 6 7 8
3    3.0 4 5 6 7 8
4    4.0 5 6 7 8
5    5.0 6 7 8
6    6.0 7 8
7    7.0 8
8    8.0 1 2 3 4 5 6 7
The 1th result
1    0.150 1.121 _2 3 4 5 6 7 8 
2    0.150 1.243 _3 4 5 6 7 8 
3    0.150 1.526 _4 5 6 7 8 
4    0.150 2.036 _5 6 7 8 
5    0.150 2.886 _6 7 8 
6    0.150 4.303 _7 8 
7    0.150 6.853 _8 
8    0.150 11.831 _1 2 3 4 5 6 7 
The 2th result
1    0.150 1.587 _2 3 4 5 6 7 8 
2    0.150 1.723 _3 4 5 6 7 8 
3    0.150 1.899 _4 5 6 7 8 
4    0.150 2.158 _5 6 7 8 
5    0.150 2.591 _6 7 8 
6    0.150 3.409 _7 8 
7    0.150 5.237 _8 
8    0.150 9.626 _1 2 3 4 5 6 7 
The 3th result
1    0.150 1.319 _2 3 4 5 6 7 8 
2    0.150 1.512 _3 4 5 6 7 8 
3    0.150 1.756 _4 5 6 7 8 
4    0.150 2.079 _5 6 7 8 
5    0.150 2.537 _6 7 8 
6    0.150 3.271 _7 8 
7    0.150 4.720 _8 
8    0.150 8.003 _1 2 3 4 5 6 7 
The 4th result
1    0.150 1.122 _2 3 4 5 6 7 8 
2    0.150 1.282 _3 4 5 6 7 8 
3    0.150 1.496 _4 5 6 7 8 
4    0.150 1.795 _5 6 7 8 
5    0.150 2.236 _6 7 8 
6    0.150 2.955 _7 8 
7    0.150 4.345 _8 
8    0.150 7.386 _1 2 3 4 5 6 7 
The 5th result
1    0.150 1.047 _2 3 4 5 6 7 8 
2    0.150 1.183 _3 4 5 6 7 8 
3    0.150 1.365 _4 5 6 7 8 
4    0.150 1.619 _5 6 7 8 
5    0.150 2.000 _6 7 8 
6    0.150 2.634 _7 8 
7    0.150 3.890 _8 
8    0.150 6.686 _1 2 3 4 5 6 7

注:迭代方法和次数不同会对结果产生影响，不必完全与答案匹配，只需运行结果趋于合理即可。（第二列为多余值）

注：由于启动服务、编译、循环迭代等耗时，以及单次MapReduce过程资源消耗较大且时间较长，因而单个用户使用资源有限，评测时间较长（40s左右）！
请耐心等待！相信自己！通往成功的路上不会太久！

慧眼识珍发现有用的文本信息，请先从网页排序分析开始！

开始你的任务吧，祝你成功！

如果你觉得这一关的内容对你有帮助，请你在下面点赞。

import java.io.IOException;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.StringTokenizer;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class PageRank {

  public static class MyMapper   extends Mapper<Object, Text, Text, Text>
  {
	  	private Text id = new Text();
  	 	public void map(Object key, Text value, Context context ) throws IOException, InterruptedException
  	 	{
  	 		String line = value.toString();
//判断是否为输入文件
  	 		if(line.substring(0,1).matches("[0-9]{1}"))
  	 		{
				  boolean flag = false;
				  if(line.contains("_"))
				  {
						line = line.replace("_","");
						flag = true;
				  }
//对输入文件进行处理
				  String[] values = line.split("\t");
				  Text t = new Text(values[0]);
				  String[] vals = values[1].split(" ");
				  String url="_";//保存url，用作下次计算
				  double pr = 0;
				  int i = 0;
				  int num = 0;
				  
				  if(flag)
				  {
					  i=2;
					  pr=Double.valueOf(vals[1]);
					  num=vals.length-2;
				  }
				  else
				  {
					  i=1;
					  pr=Double.valueOf(vals[0]);
					  num=vals.length-1;
				  }
				  
				  for(;i<vals.length;i++)
				  {
					  url=url+vals[i]+" ";
					  id.set(vals[i]);
					  Text prt = new Text(String.valueOf(pr/num));
					  context.write(id,prt);
				  }
				  context.write(t,new Text(url));
			  }
		  }
  }

  public static class MyReducer  extends Reducer<Text,Text,Text,Text>
  {
			  private Text result = new Text();
			  private Double pr = new Double(0);
			  
		 public void reduce(Text key, Iterable<Text> values,  Context context  ) throws IOException, InterruptedException
		 {
			  double sum=0;
			  String url="";
			  
//****请通过url判断否则是外链pr，作计算前预处理****//
/*********begin*********/
   for(Text val:values)
              {
                  if(!val.toString().contains("_"))
                  {
                      sum=sum+Double.valueOf(val.toString());
                  }
                  else
                 {
                      url=val.toString();
                  }
              }


/*********end**********/			
	  
  			 
//****请补全用完整PageRank计算公式计算输出过程，q取0.85****//
/*********begin*********/
 pr=0.15+0.85*sum;
              String str=String.format("%.3f",pr);
              result.set(new Text(str+" "+url));
              context.write(key,result);



/*********end**********/	

		  }
 }

	public static void main(String[] args) throws Exception
	{
		     String paths="file:///tmp/input/Wiki0";//输入文件路径，不要改动
			String path1=paths;
			String path2="";

			for(int i=1;i<=5;i++)//迭代5次
			  {
				System.out.println("This is the "+i+"th job!");
				System.out.println("path1:"+path1);
				System.out.println("path2:"+path2);
				Configuration conf = new Configuration();
				Job job = new Job(conf, "PageRank");
				path2=paths+i;	  
				job.setJarByClass(PageRank.class);
				job.setMapperClass(MyMapper.class);
		//****请为job设置Combiner类****//
/*********begin*********/
                job.setCombinerClass(MyReducer.class);

/*********end**********/					
				job.setReducerClass(MyReducer.class);
				job.setOutputKeyClass(Text.class);
				job.setOutputValueClass(Text.class);
				FileInputFormat.addInputPath(job, new Path(path1));
 				FileOutputFormat.setOutputPath(job, new Path(path2));
				path1=path2;	  
			 job.waitForCompletion(true);
			System.out.println(i+"th end!");
		}
	  }	
 }

本文最后，给出上述内容源码

test 3
import java.io.IOException;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.util.Iterator;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.GenericOptionsParser;

public class InvertedIndex {
    public static class InvertedIndexMapper extends Mapper<LongWritable, Text, Text, Text> 
    {
        public void map(LongWritable key, Text value, Context context)  
                throws IOException, InterruptedException 
        {    
            FileSplit fileSplit = (FileSplit)context.getInputSplit();
            String fileName = fileSplit.getPath().getName();
            String word;
            IntWritable frequence=new IntWritable();
            int one=1;
            Hashtable<String,Integer>    hashmap=new Hashtable();
            StringTokenizer itr = new StringTokenizer(value.toString());

            for(;itr.hasMoreTokens(); ) 
            {   
                word=itr.nextToken();
                if(hashmap.containsKey(word)){
                    hashmap.put(word,hashmap.get(word)+1);
                }else{
                    hashmap.put(word, one);
                }
            }

            for(Iterator<String> it=hashmap.keySet().iterator();it.hasNext();){
                word=it.next();
                frequence=new IntWritable(hashmap.get(word));
                Text fileName_frequence = new Text(fileName+"@"+frequence.toString());    
                context.write(new Text(word),fileName_frequence);
            }
        }
    }
    public static class InvertedIndexCombiner extends Reducer<Text,Text,Text,Text>{
        protected void reduce(Text key,Iterable<Text> values,Context context)
                        throws IOException ,InterruptedException{ 
            String fileName="";
            int sum=0;
            String num;
            String s;
            for (Text val : values) {
                    s= val.toString();
                    fileName=s.substring(0, val.find("@"));
                    num=s.substring(val.find("@")+1, val.getLength());
                    sum+=Integer.parseInt(num);
            }
        IntWritable frequence=new IntWritable(sum);
        context.write(key,new Text(fileName+"@"+frequence.toString()));
        }
    }
    public static class InvertedIndexReducer extends Reducer<Text, Text, Text, Text> 
    {    @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException 
         {    Iterator<Text> it = values.iterator();
            StringBuilder all = new StringBuilder();
            if(it.hasNext())  all.append(it.next().toString());
            for(;it.hasNext();) {
                all.append(";");
                all.append(it.next().toString());                    
            }
            context.write(key, new Text(all.toString()));
        }
    }
    public static void main(String[] args) 
    {
        if(args.length!=2){
            System.err.println("Usage: InvertedIndex <in> <out>");
            System.exit(2);
        }
      try {
                Configuration conf = new Configuration();
                String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
                Job job = new Job(conf, "invertedindex");
                job.setJarByClass(InvertedIndex.class);
                job.setMapperClass(InvertedIndexMapper.class);
                job.setCombinerClass(InvertedIndexCombiner.class);
                job.setReducerClass(InvertedIndexReducer.class);
                job.setOutputKeyClass(Text.class);
                job.setOutputValueClass(Text.class);
                FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
                FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
                System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (Exception e) { 
            e.printStackTrace();
        }
    }
}

test 4
import java.io.IOException;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.StringTokenizer;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class PageRank {
  public static class MyMapper   extends Mapper<Object, Text, Text, Text>
  {
          private Text id = new Text();
           public void map(Object key, Text value, Context context ) throws IOException, InterruptedException
           {
               String line = value.toString();
               if(line.substring(0,1).matches("[0-9]{1}"))
               {
                  boolean flag = false;
                  if(line.contains("_"))
                  {
                        line = line.replace("_","");
                        flag = true;
                  }
                  String[] values = line.split("\t");
                  Text t = new Text(values[0]);
                  String[] vals = values[1].split(" ");
                  String url="_";
                  double pr = 0;
                  int i = 0;
                  int num = 0;
                  if(flag)
                  {
                      i=2;
                      pr=Double.valueOf(vals[1]);
                      num=vals.length-2;
                  }
                  else
                  {
                      i=1;
                      pr=Double.valueOf(vals[0]);
                      num=vals.length-1;
                  }
                  for(;i<vals.length;i++)
                  {
                      url=url+vals[i]+" ";
                      id.set(vals[i]);
                      Text prt = new Text(String.valueOf(pr/num));
                      context.write(id,prt);
                  }
                  context.write(t,new Text(url));
              }
          }
  }
  public static class MyReducer  extends Reducer<Text,Text,Text,Text>
  {
              private Text result = new Text();
              private Double pr = new Double(0);
         public void reduce(Text key, Iterable<Text> values,  Context context  ) throws IOException, InterruptedException
         {
              double sum=0;
              String url="";

              for(Text val:values)
              {
                  if(!val.toString().contains("_"))
                  {
                      sum=sum+Double.valueOf(val.toString());
                  }
                  else
                 {
                      url=val.toString();
                  }
              }
              pr=0.15+0.85*sum;
              String str=String.format("%.3f",pr);
              result.set(new Text(str+" "+url));
              context.write(key,result);

          }
 }
    public static void main(String[] args) throws Exception
    {
             String paths="file:///tmp/input/Wiki0";
            String path1=paths;
            String path2="";
            for(int i=1;i<=20;i++)
              {
                System.out.println("This is the "+i+"th job!");
                System.out.println("path1:"+path1);
                System.out.println("path2:"+path2);
                Configuration conf = new Configuration();
                Job job = new Job(conf, "PageRank");
                path2=paths+i;      
                job.setJarByClass(PageRank.class);
                job.setMapperClass(MyMapper.class);
                job.setCombinerClass(MyReducer.class);
                job.setReducerClass(MyReducer.class);
                job.setOutputKeyClass(Text.class);
                job.setOutputValueClass(Text.class);
                FileInputFormat.addInputPath(job, new Path(path1));
                 FileOutputFormat.setOutputPath(job, new Path(path2));
                path1=path2;      
             job.waitForCompletion(true);
            System.out.println(i+"th end!");
        }
      }    
 }

如是Rushy

关注

5
点赞
踩
14

收藏

觉得还不错? 一键收藏
0
评论
倒排索引和利用PageRank算法进行网页排序

第三章倒排索引前面通过词频统计，已经可以找出高频率的“关键词”了，这些词汇出现的频率很高以至于很难直接对其所在的文档进行查找。必须借助一定的关系模型表示单词与文本的关系，然后才可以实现快速搜索查找。单词-文档矩阵是表达这种包含关系的最简洁的概念模型。每列代表文档包含了哪些单词，比如文档1包含了词汇1和词汇4，而不包含其它单词。每行代表了哪些文档包含了某个单词。比如对于词汇1来说，文档1和文档4中出现过单词1，而其它文档不包含词汇1。搜索引擎的索引其实就是实现“单词-文档矩阵”的具体数据结构。可以有
复制链接

扫一扫