2 索引模块
采用二元分词存储
3 搜索模块
3.1 ASP.NET界面
3.2 搜索方法
/// <summary>
/// Runs the query held in <c>Q</c> (with <c>T</c> passed through to the searcher)
/// and fills <c>Results</c> with one row (title, content, url) per displayable hit
/// of the current page.
/// </summary>
/// <remarks>
/// A query that contains no space and is longer than three characters is first
/// split by the Chinese tokenizer; tokens equal to an entry of <c>m_stopWords</c>
/// are dropped and the rest are joined with single spaces. If every token is a
/// stop word the original query text is searched unchanged.
/// When the search returns no hits, a single explanatory row is added instead.
/// Hits whose url is missing, already seen (<c>m_oldUrls.CheckRepeatUrl</c>) or
/// ends with "/" are skipped and counted in <c>m_invalidCount</c>.
/// Exceptions raised while searching or building rows are caught and reported
/// on the console; the rows added so far are kept.
/// </remarks>
private void Search()
{
    // A missing query is handled like an empty one instead of crashing the page.
    string searchStr = this.Q ?? "";
    string prefix = this.T;
    SearchTest searcher = new SearchTest();
    DateTime start = DateTime.Now;

    // Create the result columns; skip the ones that already exist so a second
    // call on the same table does not fail with a duplicate-column error.
    foreach (string column in new string[] { "title", "content", "url" })
    {
        if (!this.Results.Columns.Contains(column))
        {
            this.Results.Columns.Add(column, typeof(string));
        }
    }

    // Only un-spaced queries longer than 3 characters are re-tokenized.
    if (searchStr.IndexOf(' ') == -1 && searchStr.Length > 3)
    {
        List<string> tokens = Sj110.Com.Chinese.Tokenizer.Tokenize(searchStr);
        List<string> kept = new List<string>();
        if (tokens != null)
        {
            foreach (string token in tokens)
            {
                bool isStopWord = false;
                foreach (string stop in m_stopWords)
                {
                    if (token == stop)
                    {
                        isStopWord = true;
                        break;
                    }
                }
                if (!isStopWord)
                {
                    kept.Add(token);
                }
            }
        }

        // If nothing survives stop-word filtering, keep the query as typed
        // rather than searching for an empty string.
        if (kept.Count > 0)
        {
            searchStr = string.Join(" ", kept.ToArray());
        }
    }

    try
    {
        Hits h = searcher.search(searchStr, prefix);
        this.m_total = GetValidLength(h);
        this.m_startAt = initStartAt();

        // How many items we should show - fewer than the page size at the end of the results.
        int resultsCount = smallerOf(m_total, this.m_maxResults + this.m_startAt);

        int hitCount = h.Length();
        if (hitCount == 0)
        {
            // The message is emitted as HTML, so the user-supplied query must be encoded.
            DataRow emptyRow = this.Results.NewRow();
            emptyRow["title"] = "您查询的关键字<font color=CC0033>" + System.Web.HttpUtility.HtmlEncode(searchStr) + "</font>暂无结果。<br><br>提示:多个关键字之间请加空格。“<font color=black>公交 线路</font>”比“<font color=black>公交线路</font>”更容易搜到结果。";
            emptyRow["url"] = "default.aspx";
            this.Results.Rows.Add(emptyRow);
            return;
        }

        // The search terms are the same for every hit; split once and ignore
        // empty entries produced by repeated spaces.
        string[] searchArr = searchStr.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

        // resultsCount grows for every skipped hit, so it must also be bounded
        // by the real number of hits to avoid reading past the end.
        for (int i = m_startAt; i < resultsCount && i < hitCount; i++)
        {
            Document doc = h.Doc(i);
            string url = doc.Get("url");
            if (url == null || m_oldUrls.CheckRepeatUrl(url) || url.EndsWith("/", StringComparison.Ordinal))
            {
                // Skip the hit and widen the window so the page is still filled.
                m_invalidCount++;
                resultsCount++;
                continue;
            }

            string content = doc.Get("content") ?? "";
            string title = doc.Get("title");
            if (title == null || title.Trim() == "")
            {
                title = "无标题";
            }

            content = GetBestFragments(content, searchArr);
            content = Hilighter(content, searchArr);
            title = Hilighter(title, searchArr);

            DataRow row = this.Results.NewRow();
            row["title"] = title;
            row["content"] = content;
            row["url"] = url;
            this.Results.Rows.Add(row);
        }

        // Result information for the pager / status line.
        this.m_duration = DateTime.Now - start;
        this.m_fromItem = this.m_startAt + 1;
        this.m_toItem = smallerOf(this.m_startAt + m_maxResults, m_total);
    }
    catch (Exception ex)
    {
        // Deliberately best-effort: a failed search leaves the rows gathered so far.
        // NOTE(review): console output is normally not visible in a web application —
        // consider routing this to the site's logging facility.
        Console.WriteLine(ex.Message);
    }
}
// NOTE(review): this brace-delimited body has no method header in view and repeats,
// statement for statement, the body of Search() directly above it — presumably a
// copy/paste or extraction duplicate; confirm before relying on it.
{
//int startAt, len;
// Query text and the value passed as the second argument to searcher.search().
string searchStr = this.Q;
string prefix = this.T;
SearchTest searcher = new SearchTest();
// Start time, used below to compute m_duration.
DateTime start = DateTime.Now;
// create the result DataTable
this.Results.Columns.Add("title", typeof(string));
this.Results.Columns.Add("content", typeof(string));
this.Results.Columns.Add("url", typeof(string));
// Only queries that contain no space and are longer than 3 characters are re-tokenized.
if ((searchStr.IndexOf(" ") == -1)&&searchStr.Length>3)
{
List<string> resultList = Sj110.Com.Chinese.Tokenizer.Tokenize(searchStr);
StringBuilder sb = new StringBuilder();
foreach (string result in resultList)
{
// Drop tokens that equal an entry of m_stopWords; keep the rest, space separated.
bool bStop=false;
foreach (string stop in m_stopWords)
if (result == stop)
{
bStop = true;
break;
}
if (bStop == false)
{
sb.Append(result);
sb.Append(" ");
}
//sb.AppendFormat("{0} ", result);
}
// Strip the trailing space appended after the last kept token.
// NOTE(review): if no token was kept, sb is empty and Remove is called with
// index -1 — this looks like it would throw; confirm.
sb.Remove(sb.Length - 1, 1);
searchStr = sb.ToString();
}
try
{
// fields is only referenced by the commented-out search call below.
string[] fields = { "content", "title" };
//Hits h = searcher.search(searchStr, fields, prefix);
//Hits h = searcher.search(searchStr, "content");
Hits h = searcher.search(searchStr, prefix);
//this.m_total = h.Length();
this.m_total = GetValidLength(h);
// initialize startAt
this.m_startAt = initStartAt();
// how many items we should show - less than defined at the end of the results
int resultsCount = smallerOf(m_total, this.m_maxResults + this.m_startAt);
// no hits: add a single placeholder row carrying the "no results" message and stop
if (h.Length() == 0)
{
DataRow row = this.Results.NewRow();
row["title"] = "您查询的关键字<font color=CC0033>" + searchStr + "</font>暂无结果。<br><br>提示:多个关键字之间请加空格。“<font color=black>公交 线路</font>”比“<font color=black>公交线路</font>”更容易搜到结果。";
row["url"] = "default.aspx";
this.Results.Rows.Add(row);
return;
}
// Walk the hits of the current page, starting at m_startAt.
for (int i = m_startAt; i < resultsCount; i++)
{
Document doc = h.Doc(i);
string url = doc.Get("url");
//if (url == m_oldUrl||url.EndsWith("/"))
// Skip hits whose url is reported as a repeat or ends with "/"; resultsCount is
// incremented so the skipped hit does not reduce the number of rows shown.
// NOTE(review): resultsCount is not bounded by h.Length() here — confirm h.Doc(i)
// cannot be asked for an index past the last hit.
if (m_oldUrls.CheckRepeatUrl(url) || url.EndsWith("/"))
{
m_invalidCount++;
resultsCount++;
continue;
}
//m_oldUrl = url;
string content = doc.Get("content");
string title = doc.Get("title");
// Blank titles are replaced by a fixed placeholder text.
if (title.Trim() == "") title = "无标题";
// The space-separated query terms are handed to the fragment and highlight helpers.
String[] searchArr = searchStr.Split(' ');
//startAt = content.IndexOf(searchArr[0]);
//startAt = startAt - 20;
//startAt = (startAt < 0 ? 0 : startAt);
//len = (startAt + 255 > content.Length ? content.Length - startAt : 255);
//content = content.Substring(startAt, len);
content = GetBestFragments(content, searchArr);
content = Hilighter(content, searchArr);
title = Hilighter(title, searchArr);
// One result row per displayed hit.
DataRow row = this.Results.NewRow();
row["title"] = title;
row["content"] = content;
row["url"] = url;
this.Results.Rows.Add(row);
}
// result information
this.m_duration = DateTime.Now - start;
this.m_fromItem = this.m_startAt + 1;
this.m_toItem = smallerOf(this.m_startAt + m_maxResults, m_total);
}
catch (Exception ex)
{
// Any exception is only written to the console; rows added so far are kept
// and the result information above is left unset.
Console.WriteLine(ex.Message);
//throw;
return;
}
}