近期做项目中有用到过Lucene,那个模块是由一位前端大神负责的,空闲时间我也做了个关于Lucene做全文检索的Demo,记录下来,方便以后学习。
关于Lucene的原理,网上有长篇大论的文章,有兴趣的话可以去阅读,在此我就直奔主题,在代码中分析其原理。
1、创建索引(此处我用的是盘古分词)
#region 创建索引 void CreateIndex(object sender, EventArgs e)
/// <summary>
/// Builds (or refreshes) the Lucene index under <c>indexPath</c> from every story
/// returned by <c>bllHelper.GetAllStory()</c>, using the PanGu analyzer.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void CreateIndex(object sender, EventArgs e)
{
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
    try
    {
        // True when the index folder already contains an index; in that case we update in place.
        bool isUpdate = IndexReader.IndexExists(directory);

        // Only one writer may hold the index at a time: opening an IndexWriter locks the directory.
        // A stale lock (e.g. left by a crashed indexing run) is released here before opening the writer.
        if (isUpdate && IndexWriter.IsLocked(directory))
        {
            IndexWriter.Unlock(directory);
        }

        IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), !isUpdate, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
        try
        {
            foreach (Story story in bllHelper.GetAllStory())
            {
                string id = story.ID.ToString();

                // Drop any previous version of this story so re-indexing does not create duplicates.
                writer.DeleteDocuments(new Term("ID", id));

                // One Document per story. Fields that must be full-text searchable use Field.Index.ANALYZED.
                // Text columns may be NULL in the database; Field does not accept a null value,
                // so nulls are indexed as empty strings.
                Document document = new Document();
                document.Add(new Field("ID", id, Field.Store.YES, Field.Index.NOT_ANALYZED));
                document.Add(new Field("Title", story.Title ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));
                document.Add(new Field("Author", story.Author ?? string.Empty, Field.Store.YES, Field.Index.NOT_ANALYZED));
                document.Add(new Field("Content", story.Content ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));
                document.Add(new Field("URL", story.URL ?? string.Empty, Field.Store.YES, Field.Index.NOT_ANALYZED));
                writer.AddDocument(document);
            }
        }
        finally
        {
            // Always close the writer so the write lock is released even when indexing fails.
            writer.Close();
        }
    }
    finally
    {
        directory.Close();
    }
}
#endregion
2.接下来就是搜索了
#region 搜索 IEnumerable<Story> Search(string keyWord)
/// <summary>
/// Searches the index under <c>indexPath</c> for stories whose title or content
/// matches the given keyword, returning them with the matches highlighted.
/// </summary>
/// <param name="keyWord">Raw user input; it is tokenized by <c>CommonHelper.SplitWords</c>.</param>
/// <returns>
/// The matching stories (at most 1000 are collected). Title and Content have been passed through
/// <c>CommonHelper.Highlight</c>. A null or empty keyword yields an empty list.
/// </returns>
private IEnumerable<Story> Search(string keyWord)
{
    List<Story> list = new List<Story>();
    if (string.IsNullOrEmpty(keyWord))
    {
        return list;
    }

    // Tokenize once, e.g. "北京是首都" -> "北京", "是", "首都"; both queries reuse the result.
    string[] words = CommonHelper.SplitWords(keyWord);

    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
    try
    {
        IndexReader reader = IndexReader.Open(directory, true);
        try
        {
            IndexSearcher searcher = new IndexSearcher(reader);
            try
            {
                PhraseQuery queryTitle = new PhraseQuery();
                PhraseQuery queryContent = new PhraseQuery();
                foreach (string word in words)
                {
                    queryTitle.Add(new Term("Title", word));
                    queryContent.Add(new Term("Content", word));
                }
                // Maximum distance allowed between the query terms; terms further apart are treated as unrelated.
                queryTitle.SetSlop(100);
                queryContent.SetSlop(100);

                // Combine the two conditions: a hit in either the title or the content is enough.
                // How Occur values combine:
                //   MUST + MUST         -> AND (a document must satisfy both clauses)
                //   MUST + MUST_NOT     -> must satisfy the first and must not satisfy the second
                //   MUST_NOT + MUST_NOT -> not meaningful
                //   SHOULD + MUST       -> the MUST clause decides whether a document matches
                //   SHOULD + MUST_NOT   -> behaves like MUST + MUST_NOT
                //   SHOULD + SHOULD     -> OR
                BooleanQuery query = new BooleanQuery();
                query.Add(queryTitle, BooleanClause.Occur.SHOULD);
                query.Add(queryContent, BooleanClause.Occur.SHOULD);

                // Container for the search results.
                TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
                searcher.Search(query, null, collector);
                ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;

                foreach (ScoreDoc doc in docs)
                {
                    // doc.doc is the id Lucene assigned internally, not the story's own ID.
                    Document document = searcher.Doc(doc.doc);
                    Story story = new Story();
                    story.ID = Convert.ToInt32(document.Get("ID"));
                    story.Title = CommonHelper.Highlight(keyWord, document.Get("Title"));
                    story.Author = document.Get("Author");
                    story.Content = CommonHelper.Highlight(keyWord, document.Get("Content"));
                    story.URL = document.Get("URL");
                    list.Add(story);
                }
            }
            finally
            {
                searcher.Close();
            }
        }
        finally
        {
            // The searcher was built on a caller-supplied reader, so the reader is closed explicitly.
            reader.Close();
        }
    }
    finally
    {
        directory.Close();
    }
    return list;
}
#endregion
3.帮助类文件
3.1 BusinessHelper类
#region 根据ID获取小说 +Story GetStoryById(int id)
/// <summary>
/// Loads a single story by its primary key.
/// </summary>
/// <param name="id">Value matched against the <c>Id</c> column.</param>
/// <returns>The story built from the first row returned, or <c>null</c> when no row matches.</returns>
public Story GetStoryById(int id)
{
    // NOTE(review): "nolock" without parentheses looks like a table alias rather than a
    // WITH (NOLOCK) hint - confirm which one was intended before changing the query text.
    const string query = "SELECT * FROM Story nolock WHERE Id = @Id";
    SqlParameter idParameter = new SqlParameter("@Id", id);
    using (SqlDataReader row = SqlHelper.ExecuteDataReader(query, idParameter))
    {
        return row.Read() ? ToModel(row) : null;
    }
}
#endregion
#region 获取所有的小说 +IEnumerable<Story> GetAllStory()
/// <summary>
/// Loads every row of the Story table.
/// </summary>
/// <returns>A fully materialized list of stories; empty when the query returns no rows.</returns>
public IEnumerable<Story> GetAllStory()
{
    const string query = "SELECT * FROM Story nolock";
    List<Story> stories = new List<Story>();
    using (SqlDataReader rows = SqlHelper.ExecuteDataReader(query))
    {
        // Convert each row while the reader is still open.
        while (rows.Read())
        {
            Story current = ToModel(rows);
            stories.Add(current);
        }
    }
    return stories;
}
#endregion
#region 把SqlDataReader转换成实体 Story ToModel(SqlDataReader reader)
/// <summary>
/// Maps the reader's current row onto a new <c>Story</c>.
/// </summary>
/// <param name="reader">Reader positioned on the row to convert.</param>
/// <returns>A story populated from the Id, Title, Author, Content and URL columns.</returns>
private Story ToModel(SqlDataReader reader)
{
    // Columns are read in the same order as the properties are listed.
    return new Story
    {
        ID = (int)ToModelValue(reader, "Id"),
        Title = (string)ToModelValue(reader, "Title"),
        Author = (string)ToModelValue(reader, "Author"),
        Content = (string)ToModelValue(reader, "Content"),
        URL = (string)ToModelValue(reader, "URL")
    };
}
#endregion
/// <summary>
/// Converts a CLR value into the form expected for a database parameter.
/// </summary>
/// <param name="value">Any value, possibly <c>null</c>.</param>
/// <returns><c>DBNull.Value</c> when <paramref name="value"/> is null; otherwise the value itself.</returns>
private object ToDBValue(object value)
{
    return value ?? DBNull.Value;
}
/// <summary>
/// Reads one column of the reader's current row, translating database NULL to a CLR null.
/// </summary>
/// <param name="reader">Reader positioned on the row to read.</param>
/// <param name="columnName">Name of the column to fetch.</param>
/// <returns><c>null</c> when the column holds database NULL; otherwise the column value.</returns>
private object ToModelValue(SqlDataReader reader, string columnName)
{
    int ordinal = reader.GetOrdinal(columnName);
    return reader.IsDBNull(ordinal) ? null : reader[columnName];
}
3.2 CommonHelper类
/// <summary>
/// Splits the user's input into individual words using the PanGu analyzer.
/// </summary>
/// <param name="s">Text to tokenize; may be null or empty.</param>
/// <returns>The tokens in the order the analyzer produced them; an empty array for null or empty input.</returns>
public static string[] SplitWords(string s)
{
    List<string> words = new List<string>();

    // StringReader rejects null, so there is nothing to tokenize in that case.
    if (string.IsNullOrEmpty(s))
    {
        return words.ToArray();
    }

    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader(s));
    try
    {
        Lucene.Net.Analysis.Token token;
        // Next() returns null once there are no more tokens.
        while ((token = tokenStream.Next()) != null)
        {
            words.Add(token.TermText());
        }
    }
    finally
    {
        // Release the stream even if tokenizing fails.
        tokenStream.Close();
    }
    return words.ToArray();
}
/// <summary>
/// Marks the keyword inside the content with red, bold HTML tags using the PanGu highlighter.
/// </summary>
/// <param name="keyword">Text to highlight.</param>
/// <param name="content">Text to search in.</param>
/// <returns>
/// The best-matching fragment with highlight markup, or the unmodified
/// <paramref name="content"/> when no fragment is produced or highlighting throws.
/// </returns>
public static string Highlight(string keyword, string content)
{
    try
    {
        // The formatter supplies the prefix/suffix wrapped around each hit;
        // the highlighter needs it together with a PanGu Segment instance.
        PanGu.HighLight.Highlighter marker = new PanGu.HighLight.Highlighter(
            new PanGu.HighLight.SimpleHTMLFormatter("<font color=\"red\"><b>", "</b></font>"),
            new Segment());

        // Number of characters per summary fragment.
        marker.FragmentSize = 5000;

        string fragment = marker.GetBestFragment(keyword, content);
        return string.IsNullOrEmpty(fragment) ? content : fragment;
    }
    catch
    {
        // Highlighting is best effort: fall back to the plain text on any failure.
        return content;
    }
}
3.3 SqlHelper 类
// Connection string used by every SqlHelper method; read from the "connLuceneDB" entry of the
// application's connection-string configuration when this field is initialized.
public static string CONNECTIONSTRING = ConfigurationManager.ConnectionStrings["connLuceneDB"].ConnectionString;
#region 执行查询方法 +static DataTable ExecuteDataTable(string sql)
/// <summary>
/// Runs a query and returns the complete result set as a <see cref="DataTable"/>.
/// </summary>
/// <param name="sql">SQL text to execute.</param>
/// <returns>A table filled with the rows produced by <paramref name="sql"/>.</returns>
public static DataTable ExecuteDataTable(string sql)
{
    // Connection, command and adapter are all disposable; release each one deterministically.
    using (SqlConnection conn = new SqlConnection(SqlHelper.CONNECTIONSTRING))
    using (SqlCommand cmd = new SqlCommand(sql, conn))
    using (SqlDataAdapter da = new SqlDataAdapter(cmd))
    {
        conn.Open();
        DataTable dt = new DataTable();
        da.Fill(dt);
        return dt;
    }
}
#endregion
#region 执行查询方法,返回DataReader对象 +static SqlDataReader ExecuteDataReader(string cmdText,params SqlParameter[] parameters)
/// <summary>
/// Runs a query and returns an open <see cref="SqlDataReader"/> over its results.
/// </summary>
/// <param name="cmdText">SQL text to execute.</param>
/// <param name="parameters">Parameters referenced by <paramref name="cmdText"/>; may be omitted or null.</param>
/// <returns>
/// A reader created with <see cref="CommandBehavior.CloseConnection"/>: the caller must dispose it,
/// which also closes the underlying connection.
/// </returns>
public static SqlDataReader ExecuteDataReader(string cmdText,
    params SqlParameter[] parameters)
{
    SqlConnection conn = new SqlConnection(CONNECTIONSTRING);
    try
    {
        conn.Open();
        using (SqlCommand cmd = conn.CreateCommand())
        {
            cmd.CommandText = cmdText;
            if (parameters != null)
            {
                cmd.Parameters.AddRange(parameters);
            }
            return cmd.ExecuteReader(CommandBehavior.CloseConnection);
        }
    }
    catch
    {
        // No reader was handed out, so nobody else can close the connection: release it here.
        conn.Dispose();
        throw;
    }
}
#endregion
#region Execute INSERT / DELETE / UPDATE statements +static bool ExecuteNonQuery(string sql)
/// <summary>
/// Executes a statement that does not return rows (insert, delete or update).
/// </summary>
/// <param name="sql">SQL text to execute.</param>
/// <returns><c>true</c> when the reported affected-row count is greater than zero; otherwise <c>false</c>.</returns>
public static bool ExecuteNonQuery(string sql)
{
    using (SqlConnection connection = new SqlConnection(SqlHelper.CONNECTIONSTRING))
    {
        connection.Open();
        using (SqlCommand command = new SqlCommand(sql, connection))
        {
            int affectedRows = command.ExecuteNonQuery();
            return affectedRows > 0;
        }
    }
}
#endregion
4.小说实体类
/// <summary>
/// Story (novel) entity: one row of the Story table and one document in the search index.
/// </summary>
public class Story
{
/// <summary>
/// Story identifier.
/// </summary>
public int ID { get; set; }
/// <summary>
/// Story title.
/// </summary>
public string Title { get; set; }
/// <summary>
/// Author name.
/// </summary>
public string Author { get; set; }
/// <summary>
/// Story text.
/// </summary>
public string Content { get; set; }
/// <summary>
/// Address where the story can be read online.
/// </summary>
public string URL { get; set; }
}
5.前台
<%-- Search page: keyword box, search / build-index buttons, and a grid listing the matching stories.
     Every label inside the grid has its own ID so controls within a row are never ambiguous. --%>
<form id="form1" runat="server" method="post">
    <asp:TextBox ID="txtKW" runat="server" Width="291px"></asp:TextBox>
    <asp:Button ID="btnSearch" runat="server" Text="搜索" onclick="btnSearch_Click" />
    <asp:Button ID="btnCreateIndex" runat="server" Text="创建索引"
        onclick="btnCreateIndex_Click"/>
    <asp:GridView ID="gdvShowStory" runat="server" AutoGenerateColumns="False" CellPadding="4"
        ForeColor="#333333" GridLines="None">
        <AlternatingRowStyle BackColor="White" ForeColor="#284775" />
        <Columns>
            <asp:TemplateField HeaderStyle-Width="3%">
                <HeaderTemplate>
                    编号
                </HeaderTemplate>
                <ItemTemplate>
                    <asp:Label ID="Label1" runat="server" Text='<%# Eval("ID") %>'></asp:Label>
                </ItemTemplate>
            </asp:TemplateField>
            <asp:TemplateField HeaderStyle-Width="10%">
                <HeaderTemplate>
                    标题
                </HeaderTemplate>
                <ItemTemplate>
                    <asp:Label ID="Label2" Text='<%# Eval("Title") %>' runat="server"></asp:Label>
                </ItemTemplate>
            </asp:TemplateField>
            <asp:TemplateField HeaderStyle-Width="8%">
                <HeaderTemplate>
                    作者
                </HeaderTemplate>
                <ItemTemplate>
                    <asp:Label ID="Label3" Text='<%# Eval("Author") %>' runat="server"></asp:Label>
                </ItemTemplate>
            </asp:TemplateField>
            <asp:TemplateField HeaderStyle-Width="70%">
                <HeaderTemplate>
                    内容
                </HeaderTemplate>
                <ItemTemplate>
                    <asp:Label ID="Label4" Text='<%# Eval("Content") %>' runat="server"></asp:Label>
                </ItemTemplate>
            </asp:TemplateField>
            <asp:TemplateField HeaderStyle-Width="5%">
                <HeaderTemplate>
                    操作
                </HeaderTemplate>
                <ItemTemplate>
                    <a href='<%#Eval("URL") %>'>在线阅读</a>
                </ItemTemplate>
            </asp:TemplateField>
        </Columns>
        <EditRowStyle BackColor="#999999" />
        <FooterStyle BackColor="#5D7B9D" Font-Bold="True" ForeColor="White" />
        <HeaderStyle BackColor="#5D7B9D" Font-Bold="True" ForeColor="White" />
        <PagerStyle BackColor="#284775" ForeColor="White" HorizontalAlign="Center" />
        <RowStyle BackColor="#F7F6F3" ForeColor="#333333" />
        <SelectedRowStyle BackColor="#E2DED6" Font-Bold="True" ForeColor="#333333" />
        <SortedAscendingCellStyle BackColor="#E9E7E2" />
        <SortedAscendingHeaderStyle BackColor="#506C8C" />
        <SortedDescendingCellStyle BackColor="#FFFDF8" />
        <SortedDescendingHeaderStyle BackColor="#6F8DAE" />
    </asp:GridView>
</form>
注:需要引入几个类库
OK,到此为止,一个简单的Demo出来了,看看效果吧:
* 源码下载地址:点我下载源码