Learning Search Engines Top-Down — Analysis and Complete Annotation of PKU's Tianwang Search Engine (TSE) [2]: A Walk Through the Query-Processing Code

From the previous article, "Learning Search Engines Top-Down — Analysis and Complete Annotation of PKU's Tianwang Search Engine (TSE) [1]: Finding the Search Engine's Entry Point", we know that the whole program starts from the main function in TSESearch.cpp. Let us focus on this piece of code:

//TSESearch.cpp
	CQuery iQuery;
	iQuery.GetInputs();		// the actual processing starts here
	// current query & result page number
	iQuery.SetQuery();
	iQuery.SetStart();

	// begin to search
	gettimeofday(&begin_tv, &tz);	// start the clock, to measure the elapsed search time

	iQuery.GetInvLists(mapBuckets);		// load the whole inverted index into a map -- a bottleneck
	iQuery.GetDocIdx(vecDocIdx);		// load the document index into a vector -- a bottleneck

	CHzSeg iHzSeg;		// include ChSeg/HzSeg.h
	iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery);	// segment the query obtained via GET into the form "我/爱/你们/的/"

	vector<string> vecTerm;
	iQuery.ParseQuery(vecTerm);		// push the "/"-separated keywords, in order, into a vector

	set<string> setRelevantRst;
	iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);

	gettimeofday(&end_tv, &tz);
	// search end
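The two gettimeofday calls bracket the search so its running time can be reported. The post does not show how the difference is printed, so here is a minimal sketch of turning the two samples into milliseconds; the variable names begin_tv, end_tv, and tz match the snippet above, while the printing is my own illustration:

```cpp
#include <sys/time.h>
#include <cstdio>

int main()
{
	struct timeval begin_tv, end_tv;
	struct timezone tz;

	gettimeofday(&begin_tv, &tz);	// sample the clock before the search
	// ... run the search here ...
	gettimeofday(&end_tv, &tz);	// sample the clock after the search

	// convert the two samples into an elapsed time in milliseconds
	long ms = (end_tv.tv_sec - begin_tv.tv_sec) * 1000L
	        + (end_tv.tv_usec - begin_tv.tv_usec) / 1000L;
	printf("search took %ld ms\n", ms);
	return 0;
}
```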

 

Following this order, let us first dive into iQuery's class, CQuery.

//Query.cpp

1. GetInputs

This method parses the variables sent from the front end via GET into the HtmlInputs struct array, as in the following example and code:

//Assuming the keyword queried from the front end is "1", the contents of HtmlInputs are:
//HtmlInputs[0].Name	word
//HtmlInputs[0].Value	1
//HtmlInputs[1].Name	www
//HtmlInputs[1].Value	搜索
//HtmlInputs[2].Name	cdtype
//HtmlInputs[2].Value	GB
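The post never shows the declaration of HtmlInputs. A minimal sketch consistent with how GetInputs uses it might look like this; the field sizes and the MAX_INPUTS constant are my assumptions, not the original source:

```cpp
// Hypothetical declaration, inferred from the call sites in GetInputs
const int MAX_INPUTS = 32;	// assumed capacity, not from the TSE source

struct HtmlInput {
	char Name[256];		// form field name, e.g. "word"
	char Value[256];	// decoded field value, e.g. "1"
};

HtmlInput HtmlInputs[MAX_INPUTS];	// filled by CQuery::GetInputs()
int HtmlInputCount = 0;			// number of name/value pairs parsed
```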

 
/*
 * Get form information through environment variables.
 * Return 0 on success, otherwise exit.
 */
/**
 * Translated annotation:
 * handles the form submitted via GET (or POST)
 *
 * @access  public
 * @return  int, 0 on success
 */
int CQuery::GetInputs()
{
	int i, j;
	char *mode = getenv("REQUEST_METHOD");	// the REQUEST_METHOD environment variable; "GET" in our case
	char *tempstr;	// buffer for the current name or value being assembled
	char *in_line;	// raw GET query string or POST body
	int length;		// length of the GET query string or POST body

	cout << "Content-type: text/html\n\n";
	//cout << "Cache-Control: no-cache\n";
	//cout << "Expires: Tue, 08 Apr 1997 17:20:00 GMT\n";
	//cout << "Expires: 0\n";
	//cout << "Pragma: no-cache\n\n";

	cout << "<html>\n";
	cout << "<head>\n";
	//cout << "...";	// three commented-out lines here; their markup was lost when the post was published
	cout << "</head>\n";
	cout.flush();	// flush the output buffer: the HTTP header and the opening HTML tags
	//cout << "<link href=\"style.css\" rel=\"stylesheet\" />" << endl;

	if (mode == NULL) return 1;

	if (strcmp(mode, "POST") == 0)
	{
		length = atoi(getenv("CONTENT_LENGTH"));	// for POST, the body length comes from CONTENT_LENGTH
		if (length == 0 || length >= 256)
			return 1;
		in_line = (char*)malloc(length + 1);
		read(STDIN_FILENO, in_line, length);
		in_line[length] = '\0';
	}
	else if (strcmp(mode, "GET") == 0)
	{
		char* inputstr = getenv("QUERY_STRING");	// for GET, the parameters come from QUERY_STRING
		length = strlen(inputstr);
		if (inputstr == 0 || length >= 256)
			return 1;

		// copy everything after the '?' into in_line
		in_line = (char*)malloc(length + 1);
		strcpy(in_line, inputstr);	// beware of overflow attacks
	}

	tempstr = (char*)malloc(length + 1);	// working buffer, sized to the POST body or GET string
	if (tempstr == NULL)
	{
		printf("<html>\n");
		printf("<body>\n");
		printf("<p>Major failure #1; please notify the webmaster</p>\n");
		printf("</body></html>\n");
		fflush(stdout);	// flush the output buffer
		exit(2);	// abort on error
	}

	// walk the query string and split it into name/value pairs
	j = 0;
	for (i = 0; i < length; i++)
	{
		if (in_line[i] == '=')		// end of a field name
		{
			tempstr[j] = '\0';
			CStrFun::Translate(tempstr);	// decode URL-encoded parameters: %** -> char
			strcpy(HtmlInputs[HtmlInputCount].Name, tempstr);
			if (i == length - 1)
			{
				strcpy(HtmlInputs[HtmlInputCount].Value, "");
				HtmlInputCount++;
			}
			j = 0;
		}
		else if ((in_line[i] == '&') || (i == length - 1))	// end of a field value
		{
			if (i == length - 1)
			{
				if (in_line[i] == '+') tempstr[j] = ' ';
				else tempstr[j] = in_line[i];
				j++;
			}
			tempstr[j] = '\0';
			CStrFun::Translate(tempstr);	// decode URL-encoded parameters: %** -> char
			strcpy(HtmlInputs[HtmlInputCount].Value, tempstr);
			HtmlInputCount++;
			j = 0;
		}
		else if (in_line[i] == '+')	// '+' encodes a space
		{
			tempstr[j] = ' ';
			j++;
		}
		else
		{
			tempstr[j] = in_line[i];	// accumulate the variables, e.g. word, www, cdtype
			j++;
		}
	}
	/*
	for (int kk = 0; kk < HtmlInputCount; ++kk)
	{
		cout << "Name=" << HtmlInputs[kk].Name << "<br>";
		cout << "Value=" << HtmlInputs[kk].Value << "<br>";
	}
	// assuming the keyword queried from the front end is "1", this prints:
	// Name=word	Value=1
	// Name=www	Value=搜索
	// Name=cdtype	Value=GB
	*/
	if (in_line) free(in_line);
	if (tempstr) free(tempstr);
	return 0;
}
 
2. SetQuery
 
//Query.cpp
void CQuery::SetQuery()
{
	string q = HtmlInputs[0].Value;
	CStrFun::Str2Lower(q, q.size());	// lower-case the value of the "word" variable
	m_sQuery = q;		// set the query keyword
}
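Str2Lower is another CStrFun helper the post does not reproduce. A plausible minimal version, assuming it lower-cases the first nLen characters in place, is sketched below; this is my guess, not the TSE source:

```cpp
#include <cctype>
#include <string>

// Hypothetical sketch of CStrFun::Str2Lower. ASCII letters are lower-cased;
// multi-byte GB-encoded Chinese characters pass through unchanged because
// their bytes are not upper-case letters.
static std::string& Str2Lower(std::string &s, size_t nLen)
{
	for (size_t i = 0; i < nLen && i < s.size(); ++i)
		s[i] = (char)tolower((unsigned char)s[i]);
	return s;
}
```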
3. SetStart

The original post repeats the SetQuery body under this heading, so the real snippet is missing. Per the main function's comment ("current query & result page number"), SetStart records the offset at which the current result page should start.
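Since the real body is unavailable, here is a hedged sketch of what it plausibly does; the form-field index and the member name m_iStart are illustrative assumptions, not confirmed by the source:

```cpp
// Hedged sketch only -- the real body is not shown in the post. Assumes the
// start offset arrives as one of the parsed form fields.
void CQuery::SetStart()
{
	string s = HtmlInputs[1].Value;	// hypothetical "start" form field
	m_iStart = atoi(s.c_str());	// index of the first result on this page
}
```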
4. GetInvLists

bool CQuery::GetInvLists(map<string, string> &mapBuckets) const
{
	// open the file as a binary input stream; INF_INFO_NAME is defined in
	// the header Comm.h as: const string INF_INFO_NAME("./Data/sun.iidx");
	ifstream ifsInvInfo(INF_INFO_NAME.c_str(), ios::binary);
	// in the inverted index file, a tab '\t' separates each index term from its document numbers:
	//朱德	14383 16151 16151 16151 1683 207 6302 7889 8218 8218 8637
	//朱古力	1085 1222

	if (!ifsInvInfo) {
		cerr << "Cannot open " << INF_INFO_NAME << " for input\n";
		return false;
	}
	string strLine, strWord, strDocNum;
	// read the stream line by line into strLine and split each line
	while (getline(ifsInvInfo, strLine)) {
		string::size_type idx;
		idx = strLine.find("\t");
		strWord = strLine.substr(0, idx);
		strDocNum = strLine.substr(idx + 1);
		mapBuckets.insert(map<string, string>::value_type(strWord, strDocNum));	// store the two-column inverted table in the map

		/*
		map<string, string>::iterator iter;
		int kkk = 0;
		for (iter = mapBuckets.begin(); kkk != 10; ++iter)
		{
			cout << iter->first << "  " << iter->second << "<br>";
			++kkk;
		}
		cout.flush();
		*/
	}
	return true;
}
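To see what the map buys us, here is a small usage sketch: once GetInvLists has returned, one lookup per query term yields the space-separated document-number list for that term. The hard-coded entry for "朱德" is taken from the sample lines above:

```cpp
#include <iostream>
#include <map>
#include <sstream>
#include <string>
using namespace std;

// Minimal usage sketch: look a term up in the inverted-list map and split
// its value into individual document ids.
int main()
{
	map<string, string> mapBuckets;
	// normally filled by iQuery.GetInvLists(mapBuckets); hard-coded here
	mapBuckets["朱德"] = "14383 16151 16151 16151 1683 207 6302 7889 8218 8218 8637";

	map<string, string>::iterator it = mapBuckets.find("朱德");
	if (it != mapBuckets.end()) {
		istringstream iss(it->second);
		string docid;
		while (iss >> docid)		// the doc-number list is space-separated
			cout << docid << endl;
	}
	return 0;
}
```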
 
5. GetDocIdx

bool CQuery::GetDocIdx(vector<DocIdx> &vecDocIdx) const
{
	// open the file as a binary input stream; DOC_IDX_NAME is defined in
	// the header Comm.h as: const string DOC_IDX_NAME("./Data/Doc.idx");
	ifstream ifs(DOC_IDX_NAME.c_str(), ios::binary);
	// each line holds a document number, a byte offset, and an MD5 digest:
	//0		0		bc9ce846d7987c4534f53d423380ba70
	//1		76760	4f47a3cad91f7d35f4bb6b2a638420e5
	//2		141624	d019433008538f65329ae8e39b86026c

	if (!ifs) {
		cerr << "Cannot open " << DOC_IDX_NAME << " for input\n";
		return false;
	}

	string strLine;
	while (getline(ifs, strLine)) {
		DocIdx di;

		sscanf(strLine.c_str(), "%d%d", &di.docid, &di.offset);	// keep only the first two columns: document number and offset
		vecDocIdx.push_back(di);	// append to the vector of structs
	}

	return true;
}
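The post does not show the DocIdx struct; from the way GetDocIdx fills it, it needs at least the two fields below. The offset of entry i together with the offset of entry i+1 brackets document i in the raw crawled-page file, so fetching a document is a seek plus a read. A hedged sketch follows; the raw file name is my assumption based on the Tianwang format mentioned in the README:

```cpp
#include <fstream>
#include <string>
#include <vector>
using namespace std;

// Inferred from GetDocIdx: one entry per crawled document.
struct DocIdx {
	int docid;	// document number
	int offset;	// byte offset of the document in the raw page file
};

// Hedged sketch: read document `id` out of the raw crawled-page file by
// seeking to its offset and reading up to the next document's offset.
string FetchDoc(const vector<DocIdx> &vecDocIdx, int id)
{
	ifstream raw("./Data/Tianwang.raw", ios::binary);	// assumed file name
	if (!raw || id < 0 || id + 1 >= (int)vecDocIdx.size()) return "";

	int begin = vecDocIdx[id].offset;
	int end   = vecDocIdx[id + 1].offset;	// the next entry brackets this document

	string doc(end - begin, '\0');
	raw.seekg(begin);
	raw.read(&doc[0], end - begin);
	return doc;
}
```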
 
 
For reference, here is the README shipped with the TSE package:

TSE(Tiny Search Engine)
=======================
(Temporary) Web home: http://162.105.80.44/~yhf/Realcourse/

TSE is a free utility for non-interactive download of files from the Web.
It supports HTTP. According to a query word or url, it retrieves results
from crawled pages. It can follow links in HTML pages and create output
files in Tianwang (http://e.pku.edu.cn/) format or ISAM format files.
Additionally, it provides link structures which can be used to rebuild
the web frame.

---------------------------
Main functions in the TSE:
1) normal crawling, named SE, e.g. crawling all pages in PKU scope,
   and retrieving results from crawled pages according to a query word or url;
2) crawling images and corresponding pages, named ImgSE.

---------------------------
INSTALL:
1) execute "tar xvfz tse.XXX.gz"

---------------------------
Before running the program, note:
The program defaults to normal crawling (SE). For ImgSE, you should:
1. change the code as follows:
   1) In the "Page.cpp" file, find the two identical functions
      "CPage::IsFilterLink(string plink)". One is for ImgSE, whose urls must
      include "tupian", "photo", "ttjstk", etc.; the other is for normal
      crawling. For ImgSE, remember to comment out the SE paragraph and
      choose the right "CPage::IsFilterLink(string plink)". For SE, remember
      to open the paragraph and choose the right "CPage::IsFilterLink(string plink)".
   2) In the Http.cpp file:
      i. find "if( iPage.m_sContentType.find("image") != string::npos )"
         and comment the right paragraph.
   3) In the Crawl.cpp file:
      i. find "if( iPage.m_sContentType != "text/html" " and comment the
         right paragraph;
      ii. find "if(file_length < 40)" and choose the right line;
      iii. find "iMD5.GenerateMD5( (unsigned char*)iPage.m_sContent.c_str(), iPage.m_sContent.length() )"
         and comment the right paragraph;
      iv. find "if (iUrl.IsImageUrl(strUrl))" and comment the right paragraph.
2. run "sh Clean" (note: do not remove link4History.url; comment out the
   "rm -f link4History.url" line first), then use "link4History.url" as a
   seed file. "link4History" is produced during normal crawling (SE).

---------------------------
EXECUTION:
execute "make clean; sh Clean; make".
1) for normal crawling and retrieving:
   ./Tse -c tse_seed.img
   According to a query word or url, retrieve results from crawled pages:
   ./Tse -s
2) for ImgSE:
   ./Tse -c tse_seed.img
   After moving the Tianwang.raw.* data to a secure place, execute:
   ./Tse -c link4History.url

---------------------------
Detail functions:
1) supporting multithreaded page crawling
2) persistent HTTP connections
3) DNS cache
4) IP block
5) filtering unreachable hosts
6) parsing hyperlinks from crawled pages
7) recursively crawling pages
8) outputting Tianwang format or ISAM format files

---------------------------
Files in the package:
Tse                    --- Tse executable file
tse_unreachHost.list   --- unreachable hosts according to the PKU IP block
tse_seed.pku           --- PKU seeds
tse_ipblock            --- PKU IP block
...
Directories in the package:
hlink, include, lib, stack, uri directories --- parse links from a page

---------------------------
Please report bugs in TSE to MAINTAINERS: YAN Hongfei
* Created: YAN Hongfei, Network lab of Peking University.
* Created: July 15 2003. version 0.1.1
*   # Can crawl web pages with a process
* Updated: Aug 20 2003. version 1.0.0 !!!!
*   # Can crawl web pages with multithreads
* Updated: Nov 08 2003. version 1.0.1
*   # more classes in the codes
* Updated: Nov 16 2003. version 1.1.0
*   # integrate a new version linkparser provided by XIE Han
*   # according to all MD5 values of pages content,
*     for all the pages not seen before, store a new page
* Updated: Nov 21 2003. version 1.1.1
*   # record all duplicate urls in terms of content MD5