由于百度博客http://hi.baidu.com/jrckkyy发表文章字数有限,以后原创文章全部都先发表到csdn和cu上,再发表到百度博客上,百度博客除了放原创的文章还主要放网上寻找到的优秀文章。
本着黑客精神我将陆续把最近分析注释TSE搜索引擎的心得发布出来,老鸟,大虾,大牛,高手飘过就是了,若愿意花时间指点下小弟,在下不胜感激,有问题的朋友直接留言讨论。由于本人水平有限,分析和翻译难免有错,让大家见笑了。
上学期拜读了James F.Kurose著的《计算机网络-自顶向下方法与Internet特色(第三版影印版)》,觉得写得确实不错(希望没看的朋友一定要买来看看),自己也来搞个自顶向下的学习方法,先从用户看得到的东西出发分析研究搜索引擎,下面我们就来看看各大搜索引擎搜索界面的代码,你所需要特别注意的是form表单中的action。
雅虎http://www.yahoo.com/:
<!-- Yahoo search form: sends the text field "p" to http://search.yahoo.com/search.
     No method attribute is given, so the browser default (GET) applies. -->
<form name=s1 style="margin-bottom:0" action="http://search.yahoo.com/search">
<table cellpadding=0 cellspacing=0 border=0><tr><td>
<input type=text size=30 name=p title="enter search terms here">
<input type=submit value=Search>
</td><td><font face=arial size=-2>
· <a href="http://search.yahoo.com/search/options?p=">advanced search</a><br>
· <a href="http://buzz.yahoo.com/">most popular</a></font></td></tr></table></form>
谷歌http://www.g.cn:
<!-- Google search form: sends the text field "q" plus four hidden fields
     (complete, hl, newwindow, sa) to /search using GET. -->
<form method=GET action=/search><tr><td nowrap>
<font size=-1><input type=text name=q size=41 maxlength=2048 value="jrckkyy" title="Google 搜索">
<input type=submit name=btnG value="Google 搜索"><input type=hidden name=complete value=1><input type=hidden name=hl value="zh-CN"><input type=hidden name=newwindow value=1><input type=hidden name=sa value="2"></font></td></tr></form>
百度http://www.baidu.com:
<!-- Baidu search form: sends the query field "wd" plus several hidden fields to /s.
     No method attribute is given, so the browser default (GET) applies. -->
<form name=f2 action="/s">
<tr valign="middle">
<td nowrap>
<input type=hidden name=ct value="0">
<input type=hidden name=ie value="gb2312">
<input type=hidden name=bs value="jrckkyy">
<input type=hidden name=sr>
<input type=hidden name=z value="">
<input type=hidden name=cl value=3>
<input type=hidden name=f value=8>
<input name=wd size="35" class=i value="jrckkyy" maxlength=100>
<input type=submit value=百度一下>
<input type=button value=结果中找 onclick="return bq(f2,1,0);">
</td>
<td nowrap><a href="http://utility.baidu.com/quality/quality_form.php?word=jrckkyy">与百度对话</a></td>
</tr>
</form>
天网http://www.tianwang.com/:
<!-- Tianwang search form: sends the text field "word" and the two selects
     "range" and "cd" to /cgi-bin/tw using GET. Both selects are filled at load
     time by inline scripts that read a global array named rescode, which is
     defined outside this snippet.
     NOTE(review): the attribute name "onblur" below was restored from the
     garbled text "onblutesr"; verify against the original page. -->
<form name=f action="/cgi-bin/tw" method=get>
<td valign=center width=634 background=images/index_image_02.gif>
<table height=46 cellspacing=0 cellpadding=0 width=600 align=right border=0>
<tbody>
<tr>
<td height=50>
<table cellspacing=0 cellpadding=0 width=600 border=0>
<tbody>
<tr>
<td width="524" height="30" valign="bottom">
<div align="center">
<input name="word" type="text" size="40" maxlength="255" onClick="this.focus();checkWord(this,1)" onblur='checkWord(this,0)' value='请输入资源名称'>
<font color=#ffffff>
<select onChange=reRange(this.selectedIndex) name=range>
<script language=javascript>
<!--
for(var i = 0; i < rescode.length; i++) {
    if(i == 0) {
        document.write('<option value="0" selected>' + rescode[i][0] + '</option>');
    } else {
        document.write('<option value="' + i + '">' + rescode[i][0] + '</option>');
    }
}
document.f.range.selectedIndex = 0;
-->
</script>
</select>
</font>
-
<font color=#ffffff>
<select name=cd>
<script language=javascript>
<!--
var ind = document.f.range.selectedIndex;
var len = (rescode[ind].length - 1) / 2;
var sel = 0;
for(var i = 0; i < len; i++) {
    document.write('<option value="' + rescode[ind][2*i+1] + '">' + rescode[ind][2*i+2] + '</option>');
    if(rescode[ind][2*i+1] == 0)
        sel = i;
}
document.f.cd.selectedIndex = sel;
-->
</script>
</select>
</font></div>
</td>
<td width="71" valign="bottom"><input id=submit2 type=image height=22 width=40 src="images/so2.gif" align=absMiddle name=submit></td>
</tr>
<tr>
<td colspan=3 height=25 class=style16>
<div align=center></div>
</td>
</tr>
</tbody>
</table>
</td>
</tr>
</tbody>
</table>
</td>
</form>
测试服务器TSE:
<!-- TSE test-server search form: sends the text field "word" and the hidden
     field "cdtype" to /cgi-bin/index/TSESearch using GET. -->
<form method="get" action="/cgi-bin/index/TSESearch" name="tw">
<td width="100%" height="25" align="center">
<input type="text" name="word" size="55">
<input type="submit" value=" 搜索" name="www">
</td>
<input type="hidden" name="cdtype" value="GB">
</form>
由以上几个form的属性可以看出全部采用的是get方法(没有写method属性的表单默认也是get),CGI作为处理程序,TSE的CGI程序是用C/C++编写的。CGI全称是“公共网关接口”(Common Gateway Interface),是HTTP服务器与你的或其它机器上的程序进行“交谈”的一种工具,其程序须运行在网络服务器上。CGI逐渐被近几年来的PHP,JAVA,ASP,PERL,Python,Ruby等动态语言所取代。但是其在速度和运行效率上的优势是无法取代的。
以下是TSE CGI入口程序注释,其他搜索引擎的入口也应该类似
/**/
/**
* 程序翻译说明
* @Copyright (c) 2008, 研发部
* All rights reserved.
*
* @filesource TSESearch.cpp
* @author jrckkyy <jrckkyy@163.com>
*
* Let's start
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/time.h>
#include <unistd.h>

#include <iostream>
#include <fstream>
#include <list>
#include <map>
#include <set>
#include <string>
#include <vector>

#include "Comm.h"         // author's note: covers the two index files and the one data file
#include "Query.h"        // query processing
#include "Document.h"     // HTML document handling
#include "StrFun.h"       // string helpers
#include "ChSeg/Dict.h"   // dictionary used for word segmentation
#include "ChSeg/HzSeg.h"
#include "DisplayRst.h"   // result page output, split into top, middle and bottom parts

using namespace std;
![](https://i-blog.csdnimg.cn/blog_migrate/6810355c2f78c12e91b7997a8e8c583a.gif)
![](https://i-blog.csdnimg.cn/blog_migrate/a41954a27d6ad96fa2c2cf816e677448.gif)
/**/
/*
* An inverted file (INF) includes a term-index file & an inverted-lists file.
* The inverted lists consist of many buckets (posting lists).
* The term-index file is stored at vecTerm, and
* the inverted lists are stored at mapBuckets.
*/
![](https://i-blog.csdnimg.cn/blog_migrate/6810355c2f78c12e91b7997a8e8c583a.gif)
![](https://i-blog.csdnimg.cn/blog_migrate/a41954a27d6ad96fa2c2cf816e677448.gif)
/**/
/**
* 程序翻译说明
* 搜索程序入口前台关键字提交到该cgi程序 例如:./cgi-bin/index/TSESearch?word=123&start=1
* 倒排文件包括一个记录检索词文件和一个倒排列表文件。
* 倒排列表包含很多标志(提交名单)。
* 记录检索词文件使用vecTerm来排序,和倒排列表是用mapBuckets来排序。
*
* @access public
* @param int char 参数的汉字说明 用于接收前台get传递的参数
* @return string 0
*/
int
main(
int
argc,
char
*
argv[])
![](https://i-blog.csdnimg.cn/blog_migrate/a41954a27d6ad96fa2c2cf816e677448.gif)
...
{
struct timeval begin_tv, end_tv;
struct timezone tz;
![](https://i-blog.csdnimg.cn/blog_migrate/6a9c071a08f1dae2d3e1c512000eef41.gif)
CDict iDict;
map<string, string> dictMap, mapBuckets;
vector<DocIdx> vecDocIdx; //Document。h
![](https://i-blog.csdnimg.cn/blog_migrate/6a9c071a08f1dae2d3e1c512000eef41.gif)
CQuery iQuery;
iQuery.GetInputs(); //具体程序开始执行
// current query & result page number
iQuery.SetQuery();
iQuery.SetStart();
![](https://i-blog.csdnimg.cn/blog_migrate/6a9c071a08f1dae2d3e1c512000eef41.gif)
// begin to search
//开始具体搜索程序
gettimeofday(&begin_tv,&tz); //开始计时获取程序运行时间差
![](https://i-blog.csdnimg.cn/blog_migrate/6a9c071a08f1dae2d3e1c512000eef41.gif)
iQuery.GetInvLists(mapBuckets); //将所有字符集存入映射变量中 瓶颈所在
iQuery.GetDocIdx(vecDocIdx); //将倒排索引存入向量中 瓶颈所在
CHzSeg iHzSeg; //include ChSeg/HzSeg.h
iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery); //将get到的查询变量分词分成 "我/ 爱/ 你们/ 的/ 格式"
vector<string> vecTerm;
iQuery.ParseQuery(vecTerm); //将以"/"划分开的关键字一一顺序放入一个向量容器中
set<string> setRelevantRst;
iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);
gettimeofday(&end_tv,&tz);
// search end
//搜索完毕
![](https://i-blog.csdnimg.cn/blog_migrate/6a9c071a08f1dae2d3e1c512000eef41.gif)
//下面开始显示
CDisplayRst iDisplayRst;
iDisplayRst.ShowTop();
![](https://i-blog.csdnimg.cn/blog_migrate/6a9c071a08f1dae2d3e1c512000eef41.gif)
float used_msec = (end_tv.tv_sec-begin_tv.tv_sec)*1000
+((float)(end_tv.tv_usec-begin_tv.tv_usec))/(float)1000;
![](https://i-blog.csdnimg.cn/blog_migrate/6a9c071a08f1dae2d3e1c512000eef41.gif)
iDisplayRst.ShowMiddle(iQuery.m_sQuery,used_msec,
setRelevantRst.size(), iQuery.m_iStart);
![](https://i-blog.csdnimg.cn/blog_migrate/6a9c071a08f1dae2d3e1c512000eef41.gif)
iDisplayRst.ShowBelow(vecTerm,setRelevantRst,vecDocIdx,iQuery.m_iStart);
![](https://i-blog.csdnimg.cn/blog_migrate/6a9c071a08f1dae2d3e1c512000eef41.gif)
return 0;
![](https://i-blog.csdnimg.cn/blog_migrate/6a9c071a08f1dae2d3e1c512000eef41.gif)
}
http://jrckkyy.cublog.cn (http://blog.chinaunix.net/u2/83460/)
http://blog.csdn.net/jrckkyy
http://hi.baidu.com/jrckkyy