一.搜索引擎的原理
1.从网络中爬取数据
2.将数据保存下来
3.对数据进行清洗(筛选)
4.对需要的数据建立索引
5.用户搜索获得需要的数据
二.倒排索引技术
1.将所有的词切开,对每个词进行编号.
2.把每个词在哪些文档中出现过记录下来.
3.用户搜索某个关键词,找到对应的关键词,取出该词对应的DocId
4.根据DocId找到相应的内容,返回给用户
三.lucene简单入门
1.创建项目,导包
<!-- Maven dependencies used by the Lucene examples in these notes -->
<dependencies>
<!-- JUnit unit testing -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
<scope>test</scope>
</dependency>
<!-- Lucene core library -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>4.10.2</version>
</dependency>
<!-- Lucene query parser -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>4.10.2</version>
</dependency>
<!-- Lucene default analyzers (tokenizers) library -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>4.10.2</version>
</dependency>
<!-- Lucene result highlighting -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>4.10.2</version>
</dependency>
</dependencies>
<!-- Build settings: compile the sources as Java 1.8 and read them as UTF-8 -->
<build>
<plugins>
<!-- Java compiler plugin -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.2</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
</plugins>
</build>
2.向索引库中写入数据
/**
 * Writes one document into the Lucene index opened from the path literal below.
 *
 * <p>The document has a stored long {@code id}, a stored {@code StringField} title and a
 * stored {@code TextField} content; values are analyzed with {@code StandardAnalyzer}.
 *
 * @throws Exception if the index directory cannot be opened or written
 */
@Test
public void testName() throws Exception {
    // The analyzer splits field values into terms while documents are indexed.
    StandardAnalyzer analyzer = new StandardAnalyzer();
    // The writer configuration takes the Lucene version and the analyzer.
    IndexWriterConfig config = new IndexWriterConfig(Version.LATEST, analyzer);

    // try-with-resources closes the writer first, then the directory, even when indexing throws.
    try (FSDirectory fsDirectory = FSDirectory.open(new File("C:\\index"));
            IndexWriter indexWriter = new IndexWriter(fsDirectory, config)) {
        // A Document is the unit handed to the writer; each field is added separately.
        Document document = new Document();
        document.add(new LongField("id", 1L, Store.YES));
        document.add(new StringField("title", "谷歌之父跳槽了", Store.YES));
        document.add(new TextField("content", "谷歌之父近日跳槽", Store.YES));

        indexWriter.addDocument(document);
        indexWriter.commit();
    }
}
3.通过可视化工具查看数据已经被导入到lucene中
4.注意事项
4.1,在图中我们可以发现只有TextField做了分词,其他的只建立了索引.因此我们需要注意:
在创建索引的时候:
DoubleField、FloatField、IntField、LongField、StringField、TextField这些子类一定会被创建索引,但是不一定会被存储到文档列表,要通过构造函数中的参数Store来指定:Store.YES代表存储,Store.NO代表不存储。
在分词的时候:
TextField既创建索引,又会被分词。StringField等会创建索引,但是不会被分词。如果不分词,会造成整个字段作为一个词条,除非用户完全匹配,否则搜索不到。
在存储的时候:
如果我们的数据需要存储,就需要设置Store.YES,如果不需要存储,Store.NO
StoredField一定会被存储,但是一定不创建索引。StoredField可以创建各种数据类型的字段。
4.2通过案例查看
/**
 * Demonstrates how different field types are tokenized, indexed and stored.
 *
 * <p>Adds one document with a {@code LongField} id, a {@code TextField} title and a
 * {@code StoredField} content, so the resulting index can be inspected to compare them.
 *
 * @throws Exception if the index directory cannot be opened or written
 */
@Test
public void testName2() throws Exception {
    // The writer configuration takes the Lucene version and the analyzer.
    StandardAnalyzer analyzer = new StandardAnalyzer();
    IndexWriterConfig config = new IndexWriterConfig(Version.LATEST, analyzer);

    // try-with-resources closes the writer first, then the directory, even when indexing throws.
    try (FSDirectory directory = FSDirectory.open(new File("c:\\index"));
            IndexWriter indexWriter = new IndexWriter(directory, config)) {
        Document document = new Document();
        // Upper-case L suffix: a lower-case 'l' is easily misread as the digit 1.
        document.add(new LongField("id", 1L, Store.YES));
        document.add(new TextField("title", "分不分词", Store.YES));
        // StoredField only stores the value (see the field notes in section 4.1).
        document.add(new StoredField("content", "我只会存储,既不建立索引,也不会分词"));

        indexWriter.addDocument(document);
        indexWriter.commit();
    }
}
4.3查看结果
4.4 同时建立一批数据
/**
 * Adds a batch of three documents to the index in a single {@code addDocuments} call.
 *
 * @throws Exception if the index directory cannot be opened or written
 */
@Test
public void testName3() throws Exception {
    StandardAnalyzer analyzer = new StandardAnalyzer();
    IndexWriterConfig config = new IndexWriterConfig(Version.LATEST, analyzer);

    // try-with-resources closes the writer first, then the directory, even when indexing throws.
    try (FSDirectory directory = FSDirectory.open(new File("c:\\index"));
            IndexWriter indexWriter = new IndexWriter(directory, config)) {
        List<Document> documents = new ArrayList<Document>();

        // The first title uses Store.NO, so it is not kept in the stored document.
        Document document = new Document();
        document.add(new LongField("id", 3L, Store.YES));
        document.add(new StringField("title", "标题一", Store.NO));
        document.add(new TextField("content", "内容一", Store.YES));
        documents.add(document);

        // NOTE(review): "美容二" looks like a typo for "内容二" - confirm before changing indexed data.
        Document document2 = new Document();
        document2.add(new LongField("id", 4L, Store.YES));
        document2.add(new StringField("title", "标题二", Store.YES));
        document2.add(new TextField("content", "美容二", Store.YES));
        documents.add(document2);

        Document document3 = new Document();
        document3.add(new LongField("id", 5L, Store.YES));
        document3.add(new StringField("title", "第三个标题", Store.YES));
        document3.add(new TextField("content", "第三个的内容", Store.YES));
        documents.add(document3);

        // Hand the whole batch to the writer at once.
        indexWriter.addDocuments(documents);
        indexWriter.commit();
    }
}
四.通过ik分词器来实现中文分词
/**
 * Indexes one document using {@code IKAnalyzer} so that Chinese text is tokenized by the IK
 * analyzer instead of {@code StandardAnalyzer}.
 *
 * @throws Exception if the index directory cannot be opened or written
 */
@Test
public void testName4() throws Exception {
    IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LATEST, new IKAnalyzer());

    // The directory is now a named resource so it is closed too; the writer is closed first.
    try (FSDirectory directory = FSDirectory.open(new File("c:\\index"));
            IndexWriter indexWriter = new IndexWriter(directory, writerConfig)) {
        Document document = new Document();
        document.add(new LongField("id", 1L, Store.YES));
        document.add(new TextField("title", "用了中文分词器的标题", Store.YES));
        document.add(new TextField("content", "我是内容使用了新版分词器的", Store.YES));

        indexWriter.addDocument(document);
        indexWriter.commit();
    }
}
五.lucene的查询
/**
 * Searches the {@code content} field with a parsed query and prints each hit.
 *
 * <p>The query text is parsed by a {@code QueryParser} configured with {@code IKAnalyzer};
 * at most 20 hits are requested, and the stored {@code id}, {@code title} and
 * {@code content} values of every hit are printed.
 *
 * @throws Exception if the index cannot be read or the query text cannot be parsed
 */
@Test
public void testName6() throws Exception {
    // The reader (and its directory) were never closed before; try-with-resources releases both.
    try (FSDirectory directory = FSDirectory.open(new File("c:\\index"));
            DirectoryReader directoryReader = DirectoryReader.open(directory)) {
        IndexSearcher indexSearcher = new IndexSearcher(directoryReader);

        // The parser needs the default field to search and the analyzer for the query text.
        QueryParser queryParser = new QueryParser("content", new IKAnalyzer());
        Query query = queryParser.parse("天气");

        // Second argument: maximum number of hits to return.
        TopDocs topDocs = indexSearcher.search(query, 20);
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            // scoreDoc.doc is the internal document number used to load the stored fields.
            Document document = indexSearcher.doc(scoreDoc.doc);
            System.out.println(document.get("id"));
            System.out.println(document.get("title"));
            System.out.println(document.get("content"));
        }
    }
}
5.2第二种查询
按照词条来进行查询TermQuery
/**
 * Searches by exact term with {@code TermQuery} and prints each hit.
 *
 * <p>The term is looked up as-is in the {@code content} field (no query parsing); at most
 * 30 hits are requested.
 *
 * @throws Exception if the index cannot be read
 */
@Test
public void testName7() throws Exception {
    // The reader (and its directory) were never closed before; try-with-resources releases both.
    try (FSDirectory directory = FSDirectory.open(new File("c:\\index"));
            DirectoryReader directoryReader = DirectoryReader.open(directory)) {
        IndexSearcher indexSearcher = new IndexSearcher(directoryReader);

        // Field name plus the exact term to match.
        TermQuery termQuery = new TermQuery(new Term("content", "天气"));

        // Second argument: maximum number of hits to return.
        TopDocs topDocs = indexSearcher.search(termQuery, 30);
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            // scoreDoc.doc is the internal document number used to load the stored fields.
            Document document = indexSearcher.doc(scoreDoc.doc);
            System.out.println(document.get("id"));
            System.out.println(document.get("title"));
            System.out.println(document.get("content"));
        }
    }
}
5.3第三种查询 通配符的查询WildcardQuery
/**
 * Searches the {@code content} field with a {@code WildcardQuery} and prints each hit.
 *
 * <p>At most 20 hits are requested.
 *
 * @throws Exception if the index cannot be read
 */
@Test
public void testName8() throws Exception {
    // try-with-resources closes the reader and directory even if the search throws;
    // previously the reader was closed only on the success path and the directory never.
    try (FSDirectory directory = FSDirectory.open(new File("c:\\index"));
            DirectoryReader directoryReader = DirectoryReader.open(directory)) {
        IndexSearcher indexSearcher = new IndexSearcher(directoryReader);

        // Wildcard pattern applied to terms of the "content" field.
        WildcardQuery wildcardQuery = new WildcardQuery(new Term("content", "*天*"));

        // Second argument: maximum number of hits to return.
        TopDocs topDocs = indexSearcher.search(wildcardQuery, 20);
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            Document document = indexSearcher.doc(scoreDoc.doc);
            System.out.println(document.get("id"));
            System.out.println(document.get("title"));
            System.out.println(document.get("content"));
        }
    }
}
5.4 第四种查询,模糊查询FuzzyQuery
/**
 * Searches the {@code content} field with a {@code FuzzyQuery} and prints each hit.
 *
 * <p>The query is built with a maximum edit distance of 2; at most 20 hits are requested.
 *
 * @throws Exception if the index cannot be read
 */
@Test
public void testName9() throws Exception {
    // try-with-resources closes the reader and directory even if the search throws.
    try (FSDirectory directory = FSDirectory.open(new File("c:\\index"));
            DirectoryReader directoryReader = DirectoryReader.open(directory)) {
        IndexSearcher indexSearcher = new IndexSearcher(directoryReader);

        // Second constructor argument: maximum number of edits allowed between term and match.
        FuzzyQuery fuzzyQuery = new FuzzyQuery(new Term("content", "ilxu"), 2);

        TopDocs topDocs = indexSearcher.search(fuzzyQuery, 20);
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            Document document = indexSearcher.doc(scoreDoc.doc);
            System.out.println(document.get("id"));
            System.out.println(document.get("title"));
            System.out.println(document.get("content"));
        }
    }
}
5.5 第五种查询 数值范围查询==>NumericRangeQuery
/**
 * Searches by numeric range with {@code NumericRangeQuery} and prints each hit.
 *
 * <p>The range on {@code id} includes the lower bound 1 and excludes the upper bound 4;
 * at most 20 hits are requested.
 *
 * @throws Exception if the index cannot be read
 */
@Test
public void testName10() throws Exception {
    // try-with-resources closes the reader and directory even if the search throws.
    try (FSDirectory directory = FSDirectory.open(new File("c:\\index"));
            DirectoryReader directoryReader = DirectoryReader.open(directory)) {
        IndexSearcher indexSearcher = new IndexSearcher(directoryReader);

        // Arguments: field, min, max, minInclusive, maxInclusive.
        // Upper-case L suffix: a lower-case 'l' is easily misread as the digit 1.
        NumericRangeQuery<Long> idRange = NumericRangeQuery.newLongRange("id", 1L, 4L, true, false);

        TopDocs topDocs = indexSearcher.search(idRange, 20);
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            Document document = indexSearcher.doc(scoreDoc.doc);
            System.out.println(document.get("id"));
            System.out.println(document.get("title"));
            System.out.println(document.get("content"));
        }
    }
}
5.6 第六种多条件的查询 NumericRangeQuery BooleanQuery
/**
 * Combines two numeric range queries in a {@code BooleanQuery} and prints each hit.
 *
 * <p>The range {@code id} in [1, 3] is added with {@code Occur.MUST_NOT} and the range
 * {@code id} in [2, 4] with {@code Occur.SHOULD}; at most 20 hits are requested.
 *
 * @throws Exception if the index cannot be read
 */
@Test
public void testName11() throws Exception {
    // try-with-resources closes the reader and directory even if the search throws.
    try (FSDirectory directory = FSDirectory.open(new File("c:\\index"));
            DirectoryReader directoryReader = DirectoryReader.open(directory)) {
        IndexSearcher indexSearcher = new IndexSearcher(directoryReader);

        NumericRangeQuery<Long> excludedRange = NumericRangeQuery.newLongRange("id", 1L, 3L, true, true);
        NumericRangeQuery<Long> wantedRange = NumericRangeQuery.newLongRange("id", 2L, 4L, true, true);

        BooleanQuery booleanQuery = new BooleanQuery();
        // MUST_NOT: documents matching this clause are excluded from the result.
        booleanQuery.add(excludedRange, Occur.MUST_NOT);
        // SHOULD: optional clause that supplies the candidate documents.
        booleanQuery.add(wantedRange, Occur.SHOULD);

        TopDocs topDocs = indexSearcher.search(booleanQuery, 20);
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            Document document = indexSearcher.doc(scoreDoc.doc);
            System.out.println(document.get("id"));
            System.out.println(document.get("title"));
            System.out.println(document.get("content"));
        }
    }
}
六.索引修改
/**
 * Updates the index: calls {@code updateDocument} with a term on the {@code content} field
 * and a replacement document.
 *
 * @throws Exception if the index directory cannot be opened or written
 */
@Test
public void testName12() throws Exception {
    IndexWriterConfig config = new IndexWriterConfig(Version.LATEST, new IKAnalyzer());

    // try-with-resources closes the writer first, then the directory, even when the update throws.
    try (FSDirectory directory = FSDirectory.open(new File("c:\\index"));
            IndexWriter indexWriter = new IndexWriter(directory, config)) {
        // Term that selects the document(s) to be replaced.
        Term term = new Term("content", "地方");

        // Replacement document.
        Document document = new Document();
        document.add(new LongField("id", 2L, Store.YES));
        document.add(new TextField("title", "修改天气", Store.YES));
        document.add(new TextField("content", "天气的内容被修改", Store.YES));

        indexWriter.updateDocument(term, document);
        indexWriter.commit();
    }
}
七.删除索引库中的数据
/**
 * Deletes every document from the index with {@code deleteAll} and commits the change.
 *
 * @throws Exception if the index directory cannot be opened or written
 */
@Test
public void testName13() throws Exception {
    IndexWriterConfig config = new IndexWriterConfig(Version.LATEST, new IKAnalyzer());

    // try-with-resources closes the writer first, then the directory, even when deletion throws.
    try (FSDirectory directory = FSDirectory.open(new File("c:\\index"));
            IndexWriter indexWriter = new IndexWriter(directory, config)) {
        // Alternative for deleting by term instead of everything:
        //   indexWriter.deleteDocuments(new Term("content", "碉堡"));
        indexWriter.deleteAll();
        indexWriter.commit();
    }
}
八.高亮
/**
 * Runs a term query on the {@code content} field and prints the best highlighted fragment
 * of each hit's stored content.
 *
 * <p>Matched terms are wrapped in the tags given to {@code SimpleHTMLFormatter}. Hits
 * without a stored {@code content} value are skipped.
 *
 * @throws Exception if the index cannot be read or highlighting fails
 */
@Test
public void testName14() throws Exception {
    // Opening and closing tags placed around each matched term.
    SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<em color='red'>", "</em>");
    TermQuery termQuery = new TermQuery(new Term("content", "天气"));
    QueryTermScorer queryTermScorer = new QueryTermScorer(termQuery);
    Highlighter highlighter = new Highlighter(simpleHTMLFormatter, queryTermScorer);
    // One analyzer serves every hit; creating a new one per iteration was loop-invariant work.
    IKAnalyzer analyzer = new IKAnalyzer();

    // try-with-resources closes the reader and directory even if searching or highlighting throws.
    try (FSDirectory directory = FSDirectory.open(new File("c:\\index"));
            DirectoryReader directoryReader = DirectoryReader.open(directory)) {
        IndexSearcher indexSearcher = new IndexSearcher(directoryReader);

        TopDocs topDocs = indexSearcher.search(termQuery, 20);
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            Document hit = indexSearcher.doc(scoreDoc.doc);
            String content = hit.get("content");
            if (content != null) {
                // Re-analyze the stored text and mark the terms that matched the query.
                String bestFragment = highlighter.getBestFragment(analyzer, "content", content);
                System.out.println(bestFragment);
            }
        }
    }
}
九.lucene的排序
/**
 * Runs a numeric range query on {@code id} and prints the hits sorted by {@code id}.
 *
 * <p>The range [1, 4] is inclusive on both ends; at most 20 hits are requested.
 *
 * @throws Exception if the index cannot be read
 */
@Test
public void testName15() throws Exception {
    // try-with-resources closes the reader and directory even if the search throws.
    try (FSDirectory directory = FSDirectory.open(new File("c:\\index"));
            DirectoryReader directoryReader = DirectoryReader.open(directory)) {
        IndexSearcher indexSearcher = new IndexSearcher(directoryReader);
        NumericRangeQuery<Long> idRange = NumericRangeQuery.newLongRange("id", 1L, 4L, true, true);

        // SortField arguments: field to sort on, the field's type, and whether to reverse
        // the order. The default is ascending; passing true reverses it to descending.
        SortField sortField = new SortField("id", Type.LONG, false);
        Sort sort = new Sort(sortField);

        TopFieldDocs fieldDocs = indexSearcher.search(idRange, 20, sort);
        for (ScoreDoc scoreDoc : fieldDocs.scoreDocs) {
            Document hit = indexSearcher.doc(scoreDoc.doc);
            System.out.println(hit.get("id"));
            System.out.println(hit.get("title"));
            System.out.println(hit.get("content"));
        }
    }
}
十.手动分页
/**
 * Manual paging: fetches the top {@code pageNum * pageSize} hits sorted by {@code id}
 * in reversed (descending) order and prints only those that fall on the requested page.
 *
 * <p>If the index holds fewer hits than the page would need, only the available hits are
 * printed; a page beyond the last hit prints nothing.
 *
 * @throws Exception if the index cannot be read
 */
@Test
public void testName16() throws Exception {
    int pageNum = 1;
    int pageSize = 2;
    // Index of the first hit on the requested page.
    int start = (pageNum - 1) * pageSize;

    // try-with-resources closes the reader and directory even if the search throws.
    try (FSDirectory directory = FSDirectory.open(new File("c:\\index"));
            DirectoryReader directoryReader = DirectoryReader.open(directory)) {
        IndexSearcher indexSearcher = new IndexSearcher(directoryReader);
        NumericRangeQuery<Long> idRange = NumericRangeQuery.newLongRange("id", 1L, 4L, true, true);
        SortField sortField = new SortField("id", Type.LONG, true);
        Sort sort = new Sort(sortField);

        // Only the hits up to the end of the requested page need to be collected.
        TopFieldDocs topDocs = indexSearcher.search(idRange, pageNum * pageSize, sort);
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;

        // Clamp to the number of hits actually returned; indexing up to start + pageSize
        // unconditionally threw ArrayIndexOutOfBoundsException on a short last page.
        int end = Math.min(start + pageSize, scoreDocs.length);
        for (int i = start; i < end; i++) {
            Document hit = indexSearcher.doc(scoreDocs[i].doc);
            System.out.println(hit.get("id"));
            System.out.println(hit.get("title"));
            System.out.println(hit.get("content"));
        }
    }
}