The index files can be thought of as a database, and a database should naturally support create, read, update, and delete operations. So we first define two interfaces, IndexInterface and QueryInterface, covering the index-side operations and the query-side operations respectively.
The IndexInterface code is as follows:
import java.io.IOException;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.Query;
public interface IndexInterface {
    /**
     * Create an index entry for the given id and title.
     */
    void create(Long id, String title) throws IOException;

    /**
     * Update the document identified by id, creating it if it does not exist yet.
     */
    void updateOne(Long id, String title) throws IOException;

    /**
     * Delete all documents from the Lucene index.
     * @throws IOException
     */
    void delete() throws IOException;

    /**
     * Delete every document whose field matches the parsed key.
     * @param field the field to search in
     * @param key   the query string to match
     * @throws IOException
     * @throws ParseException
     */
    void delete(String field, String key) throws IOException, ParseException;

    /** Delete every document matching any of the given queries. */
    void delete(Query[] querys) throws IOException;

    /** Delete every document matching the given term. */
    void delete(Term term) throws IOException;

    /** Delete every document matching any of the given terms. */
    void delete(Term[] terms) throws IOException;
}
Index creation falls into two cases: discard the existing index and rebuild it from scratch, or append new documents to the existing index. Appending can introduce duplicates; later I will show how to prevent duplicate additions.
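In Lucene 4.x these two cases can be expressed through IndexWriterConfig.OpenMode. The fragment below is only a minimal sketch (the directory path is illustrative, and it reuses the same imports as the IndexService class shown further down):

IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, new IKAnalyzer());
// OpenMode.CREATE discards any existing index and starts over;
// OpenMode.CREATE_OR_APPEND (the default) keeps existing documents and appends new ones
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
IndexWriter writer = new IndexWriter(FSDirectory.open(new File("d:/test/lucene")), config);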
QueryInterface code:

import java.io.IOException;

import org.apache.lucene.queryparser.classic.ParseException;

public interface QueryInterface {

    // The return value should be wrapped in a result type of your own
    Object search(String key, String field) throws IOException, ParseException;
}
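The interface deliberately leaves the result shape open ("wrap the return value yourself"). As a minimal sketch of what such a wrapper could look like (the SearchResult class name and its fields are my own, not part of the original project):

// Hypothetical result wrapper; the field names are illustrative only
public class SearchResult {

    private String id;        // stored "id" field of the hit
    private String title;     // stored "title" field of the hit
    private String highlight; // highlighted fragment produced by the Highlighter

    public SearchResult(String id, String title, String highlight) {
        this.id = id;
        this.title = title;
        this.highlight = highlight;
    }

    public String getId() { return id; }
    public String getTitle() { return title; }
    public String getHighlight() { return highlight; }
}

With a wrapper like this, search(...) could return a List<SearchResult> instead of Object.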
Index implementation class:

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

import com.hao.search.service.IndexInterface;

public class IndexService implements IndexInterface {

    private Analyzer analyzer = new IKAnalyzer();

    private String path; // index file directory

    public IndexService(String path) {
        this.path = path;
    }

    // Discards any existing index and builds a fresh one
    public void create(Long id, String title) throws IOException {
        Directory dir = FSDirectory.open(new File(path));
        IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_47, analyzer);
        writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE); // rebuild from scratch
        IndexWriter indexWriter = new IndexWriter(dir, writerConfig);
        Document document = new Document();
        document.add(new StringField("id", id.toString(), Field.Store.YES)); // unique business record id, e.g. a product number
        document.add(new TextField("title", title, Field.Store.YES));        // field to be tokenized and searched
        indexWriter.addDocument(document);
        indexWriter.close();
    }

    // Appends a new document to the existing index (may produce duplicates)
    public void append(Long id, String title) throws IOException {
        Directory dir = FSDirectory.open(new File(path));
        IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_47, analyzer);
        IndexWriter indexWriter = new IndexWriter(dir, writerConfig);
        Document document = new Document();
        document.add(new StringField("id", id.toString(), Field.Store.YES));
        document.add(new TextField("title", title, Field.Store.YES));
        indexWriter.addDocument(document);
        indexWriter.close();
    }

    public void delete() throws IOException {
        Directory dir = FSDirectory.open(new File(path));
        IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_47, analyzer);
        IndexWriter indexWriter = new IndexWriter(dir, writerConfig);
        indexWriter.deleteAll(); // delete all documents from the index
        indexWriter.close();
    }

    public void delete(Long id) throws IOException {
        delete(new Term("id", id.toString())); // delete by id
    }

    public void delete(String field, String key) throws IOException, ParseException {
        Directory dir = FSDirectory.open(new File(path));
        IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_47, analyzer);
        IndexWriter indexWriter = new IndexWriter(dir, writerConfig);
        QueryParser parser = new QueryParser(Version.LUCENE_47, field, analyzer);
        Query query = parser.parse(key);
        indexWriter.deleteDocuments(query);
        indexWriter.close();
    }

    public void delete(Query[] querys) throws IOException {
        Directory dir = FSDirectory.open(new File(path));
        IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_47, analyzer);
        IndexWriter indexWriter = new IndexWriter(dir, writerConfig);
        indexWriter.deleteDocuments(querys); // delete every document matching any of the queries
        indexWriter.close();
    }

    public void delete(Term term) throws IOException {
        Directory dir = FSDirectory.open(new File(path));
        IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_47, analyzer);
        IndexWriter indexWriter = new IndexWriter(dir, writerConfig);
        indexWriter.deleteDocuments(term);
        indexWriter.close();
    }

    public void delete(Term[] terms) throws IOException {
        Directory dir = FSDirectory.open(new File(path));
        IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_47, analyzer);
        IndexWriter indexWriter = new IndexWriter(dir, writerConfig);
        indexWriter.deleteDocuments(terms); // delete every document matching any of the terms
        indexWriter.close();
    }

    public void updateOne(Long id, String title) throws IOException {
        Directory dir = FSDirectory.open(new File(path));
        IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_47, analyzer);
        IndexWriter indexWriter = new IndexWriter(dir, writerConfig);
        Document document = new Document();
        document.add(new StringField("id", id.toString(), Field.Store.YES));
        document.add(new TextField("title", title, Field.Store.YES));
        // Updates by id; if no document with this id exists, a new one is created.
        // This is the recommended way to append to the index without duplicates.
        indexWriter.updateDocument(new Term("id", id.toString()), document);
        indexWriter.close();
    }
}
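To make the duplicate issue concrete, here is a small usage sketch of the class above (the path matches the test class below; the behaviour described in the comments follows from addDocument vs updateDocument):

IndexService index = new IndexService("d:/test/lucene");

// append() always adds a new document, so calling it twice with the same
// id leaves two documents with id "100" in the index
index.append(100L, "duplicate example");
index.append(100L, "duplicate example");

// updateOne() first deletes any document whose "id" term matches and then adds
// the new one, so repeated calls keep exactly one document with id "200"
index.updateOne(200L, "first version of the title");
index.updateOne(200L, "second version of the title");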
StringField and TextField are both subclasses of Field. Both are indexed, but a StringField is not tokenized, which makes it suitable for values such as image URLs or product numbers in a product system.
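The difference shows up at query time. The fragment below is only a sketch (it reuses the imports from IndexService plus org.apache.lucene.search.TermQuery, and the id value "4" is just an example):

// StringField is indexed as a single un-tokenized term, so it is matched
// exactly, typically with a TermQuery on the whole stored value
Query byId = new TermQuery(new Term("id", "4"));

// TextField is run through the analyzer (IKAnalyzer here), so a parsed
// query can hit individual tokens inside the title
QueryParser parser = new QueryParser(Version.LUCENE_47, "title", new IKAnalyzer());
Query byTitle = parser.parse("入门"); // throws ParseException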
Query implementation class:
import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

import com.hao.search.service.QueryInterface;

public class QueryService implements QueryInterface {

    private Analyzer analyzer = new IKAnalyzer();

    private String path = "d:/test/lucene"; // default index directory

    public QueryService(String path) {
        this.path = path;
    }

    public Object search(String key, String field) throws IOException, ParseException {
        IndexSearcher searcher = new IndexSearcher(
                DirectoryReader.open(FSDirectory.open(new File(path))));
        QueryParser queryParser = new QueryParser(Version.LUCENE_47, field, analyzer);
        // Relation between the tokenized terms; AND means every term must match
        queryParser.setDefaultOperator(QueryParser.Operator.AND);
        Query query = queryParser.parse(key);

        QueryScorer scorer = new QueryScorer(query);
        // Produces a scored fragment, i.e. a snippet containing the queried keywords
        Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
        // Formats the matched keywords; the no-arg constructor simply bolds them
        SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter(
                "<b><font color='red'>", "</font></b>");
        // Combine scoring and formatting into a highlighter
        Highlighter highlighter = new Highlighter(simpleHTMLFormatter, scorer);
        highlighter.setTextFragmenter(fragmenter);

        ScoreDoc after = null;
        TopDocs topDocs = searcher.searchAfter(after, query, 10);
        ScoreDoc[] pageDocs = topDocs.scoreDocs;
        for (ScoreDoc doc : pageDocs) {
            Document d = searcher.doc(doc.doc);
            try {
                // Highlight the matched keywords; this is where the results could be
                // wrapped into a list of objects and returned
                String str = highlighter.getBestFragment(analyzer, field, d.get(field));
                System.out.println(str);
            } catch (InvalidTokenOffsetsException e) {
                e.printStackTrace();
            }
            after = pageDocs[topDocs.scoreDocs.length - 1];
            System.out.println("id: " + d.get(field) + "内容:" + d.get("title"));
        }
        return null;
    }
}
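The search method above uses searchAfter but only fetches the first page of ten hits. As a sketch of how the after cursor could drive paging (the loop structure and page size are my own; searcher, query and field refer to the same objects as in search()):

// Keep fetching pages of 10 hits until a page comes back empty
ScoreDoc after = null;
while (true) {
    TopDocs page = searcher.searchAfter(after, query, 10);
    if (page.scoreDocs.length == 0) {
        break; // no more results
    }
    for (ScoreDoc hit : page.scoreDocs) {
        Document d = searcher.doc(hit.doc);
        System.out.println(d.get("id") + " : " + d.get(field));
    }
    // Remember the last hit of this page; the next call resumes after it
    after = page.scoreDocs[page.scoreDocs.length - 1];
}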
Test class:

import java.io.IOException;

import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;

import com.hao.search.service.IndexInterface;
import com.hao.search.service.QueryInterface;
import com.hao.search.serviceImpl.IndexService;
import com.hao.search.serviceImpl.QueryService;

/**
 * Hello world!
 */
public class App {

    static String path = "d:/test/lucene";

    public static void main(String[] args) {
        // delete();
        // create();            // build the index first, then run the query
        query("入门", "title");
        // delete(3L);
    }

    public static void delete(Long id) {
        IndexInterface index = new IndexService(path);
        try {
            index.delete(new Term("id", id.toString()));
            System.out.println("delete success");
        } catch (IOException e1) {
            e1.printStackTrace();
            System.out.println("delete error");
        }
    }

    public static void delete() {
        IndexInterface index = new IndexService(path);
        try {
            index.delete();
            System.out.println("delete success");
        } catch (IOException e1) {
            e1.printStackTrace();
            System.out.println("delete error");
        }
    }

    public static Object query(String key, String field) {
        QueryInterface query = new QueryService(path);
        try {
            query.search(key, field);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        }
        return null;
    }

    public static void create() {
        IndexInterface index = new IndexService(path);
        try {
            index.updateOne(1L, "字符串和三元运算符字符串相加优先级的问题");
            index.updateOne(2L, "eclipse下maven更新的意外bug总结");
            index.updateOne(3L, "如何使用搜索技巧来成为一名高效的程序员");
            index.updateOne(4L, "入门级Demo,创建索引和查询高亮显示");
            index.updateOne(5L, "批量爬QQ用户信息,利用QQ查询功能进行获取QQ用户信息");
            index.updateOne(6L, "Python爬虫开发(三):数据存储以及多线程");
            index.updateOne(7L, "Python爬虫开发(一):零基础入门");
            index.updateOne(8L, "Python爬虫开发(二):整站爬虫与Web挖掘");
            index.updateOne(9L, "Python爬虫开发(三-续):快速线程池爬虫");
            index.updateOne(10L, "Python爬虫开发(五):反爬虫措施以及爬虫编写注意事项");
            index.updateOne(11L, "Python爬虫开发(四):动态加载页面的解决方案与爬虫代理");
            index.updateOne(12L, "挖洞经验 | 价值1万美金的谷歌内部主机信息泄露漏洞");
            index.updateOne(13L, "如何确认Google用户的具体电子邮件地址(已提交Google漏洞奖励计划)");
            index.updateOne(14L, "欧洲国家电网的噩梦:攻击太阳能板就能导致大规模停电?");
            index.updateOne(15L, "Java反序列化危机已过,这次来的是.Net反序列化漏洞");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

Output:
<b><font color='red'>入门</font></b>级Demo,创建索引和查询高亮显示
id: 入门级Demo,创建索引和查询高亮显示内容:入门级Demo,创建索引和查询高亮显示
Python爬虫开发(一):零基础<b><font color='red'>入门</font></b>
id: Python爬虫开发(一):零基础入门内容:Python爬虫开发(一):零基础入门