搜索引擎模型

1 爬虫模块
2 索引模块
采用二元分词存储
3 搜索模块
3.1 asp.net界面
3.2 搜索方法
/// <summary>
/// Runs the query in <c>this.Q</c> (with the prefix in <c>this.T</c>) through
/// <c>SearchTest.search</c> and fills <c>this.Results</c> with one row
/// ("title", "content", "url") per hit shown on the current page.
/// </summary>
/// <remarks>
/// Also updates <c>m_total</c>, <c>m_startAt</c>, <c>m_invalidCount</c>,
/// <c>m_duration</c>, <c>m_fromItem</c> and <c>m_toItem</c>.
/// When there are no hits, a single explanatory row is added and the method
/// returns before the duration/from/to fields are written.
/// Exceptions raised while searching are logged and swallowed (best effort).
/// </remarks>
private void Search()
{
    string searchStr = this.Q;
    string prefix = this.T;
    SearchTest searcher = new SearchTest();
    DateTime start = DateTime.Now;

    // Create the result DataTable columns.
    this.Results.Columns.Add("title", typeof(string));
    this.Results.Columns.Add("content", typeof(string));
    this.Results.Columns.Add("url", typeof(string));

    // A query without spaces that is longer than three characters is split by
    // the tokenizer and re-joined with spaces, minus any stop words.
    if (searchStr.IndexOf(" ") == -1 && searchStr.Length > 3)
    {
        List<string> tokens = Sj110.Com.Chinese.Tokenizer.Tokenize(searchStr);
        StringBuilder sb = new StringBuilder();

        foreach (string token in tokens)
        {
            bool isStopWord = false;
            foreach (string stop in m_stopWords)
            {
                if (token == stop)
                {
                    isStopWord = true;
                    break;
                }
            }

            if (isStopWord)
            {
                continue;
            }

            // Separator goes *between* tokens, so there is no trailing space
            // to strip afterwards.
            if (sb.Length > 0)
            {
                sb.Append(" ");
            }
            sb.Append(token);
        }

        // If every token was a stop word the builder is empty; keep the
        // original query instead of failing (the previous trailing-space
        // removal threw ArgumentOutOfRangeException in that case).
        if (sb.Length > 0)
        {
            searchStr = sb.ToString();
        }
    }

    try
    {
        Hits h = searcher.search(searchStr, prefix);
        this.m_total = GetValidLength(h);

        // Initialize startAt.
        this.m_startAt = initStartAt();

        // How many items we should show - less than defined at the end of the results.
        int resultsCount = smallerOf(m_total, this.m_maxResults + this.m_startAt);

        int hitCount = h.Length();
        if (hitCount == 0)
        {
            DataRow emptyRow = this.Results.NewRow();
            // The query comes from the request and is embedded in markup, so it
            // is HTML-encoded to prevent script/markup injection.
            emptyRow["title"] = "您查询的关键字<font color=CC0033>" + System.Web.HttpUtility.HtmlEncode(searchStr) + "</font>暂无结果。<br><br>提示:多个关键字之间请加空格。“<font color=black>公交 线路</font>”比“<font color=black>公交线路</font>”更容易搜到结果。";
            emptyRow["url"] = "default.aspx";
            this.Results.Rows.Add(emptyRow);
            return;
        }

        // The search terms do not change per hit, so split them once.
        string[] searchArr = searchStr.Split(' ');

        // resultsCount grows by one for every skipped hit so that a full page
        // can still be filled; the extra hitCount bound stops the loop from
        // asking for a document past the end of the hit list.
        for (int i = m_startAt; i < resultsCount && i < hitCount; i++)
        {
            Document doc = h.Doc(i);
            string url = doc.Get("url");

            // Skip URLs already seen and URLs ending in "/".
            if (m_oldUrls.CheckRepeatUrl(url) || url.EndsWith("/"))
            {
                m_invalidCount++;
                resultsCount++;
                continue;
            }

            string content = doc.Get("content");
            string title = doc.Get("title");
            if (title.Trim() == "")
            {
                title = "无标题";
            }

            content = GetBestFragments(content, searchArr);
            content = Hilighter(content, searchArr);
            title = Hilighter(title, searchArr);

            DataRow row = this.Results.NewRow();
            row["title"] = title;
            row["content"] = content;
            row["url"] = url;
            this.Results.Rows.Add(row);
        }

        // Result information.
        this.m_duration = DateTime.Now - start;
        this.m_fromItem = this.m_startAt + 1;
        this.m_toItem = smallerOf(this.m_startAt + m_maxResults, m_total);
    }
    catch (Exception ex)
    {
        // Best effort: a failed search leaves the results gathered so far.
        // NOTE(review): Console output is normally not visible in an ASP.NET
        // host - consider routing this to the application's real log.
        Console.WriteLine(ex.Message);
    }
}

   

转载于:https://www.cnblogs.com/jadepark/archive/2007/08/06/844982.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值