lucene查询mysql_lucene爬数据库中的数据，无非也是查询数据。所以我们用lucene搜索数据主要有下面几个步骤...

最新推荐文章于 2021-02-26 13:16:57 发布

weixin_39569112

最新推荐文章于 2021-02-26 13:16:57 发布

阅读量206

点赞数

文章标签： lucence查询mysql

本文链接：https://blog.csdn.net/weixin_39569112/article/details/113902184

版权

public void CreateFileIndex(String dir) {

try {

/* 这里放索引文件的位置 */

File indexDir = new File("c:\\" + dir); // 存放检索文件的路径

if (!indexDir.exists()) {

indexDir.mkdirs();

}

// 创建标准文本分析器，标准的是可以支持的中文的

Analyzer luceneAnalyzer = new StandardAnalyzer();

indexWriter = new IndexWriter(indexDir, luceneAnalyzer, true);

// 可以说是创建一个新的写入工具

// 第一个参数是要索引建立在哪个目录里

// 第二个参数是新建一个文本分析器,这里用的是标准的大家也可以自己写一个

// 第三个参数如果是true，在建立索引之前先将c: \\index目录清空

indexWriter.setMaxFieldLength(100000);

indexWriter.optimize();

} catch (IOException e) {

System.out.println("建立索引失败!!!");

e.printStackTrace();

}

三， // 添加数据到索引里去-----------------3

/**
 * Wraps the given values in one Lucene document and adds it to the index
 * through the shared {@code indexWriter}.
 *
 * <p>A document plays the role of one database row and a field the role of
 * one column. "title" and "content" are stored and analyzed, so they can be
 * searched; "url" is stored with {@code Field.Index.NO}, so it cannot.
 *
 * @param title   value of the "title" field
 * @param url     value of the "url" field
 * @param content value of the "content" field
 * @return the success message, or the failure message when adding the
 *         document raised an {@link IOException}
 */
public String createIndex(String title, String url, String content) {
    Document record = new Document();
    // Fields are added in the order url, content, title.
    record.add(new Field("url", url, Field.Store.YES, Field.Index.NO));
    record.add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED));
    record.add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));

    try {
        indexWriter.addDocument(record);
    } catch (IOException e) {
        e.printStackTrace();
        return "建立索引失败!!!!!!";
    }
    return "建立索引成功!!!!!!!";
}

四， // 关闭索引================================== 4

/**
 * Closes the shared index writer.
 *
 * <p>Per the original author's note this step is essential: if the writer is
 * not closed, the index is not created successfully.
 *
 * @throws IOException if closing the writer fails
 */
public void close() throws IOException {
    indexWriter.close();
}

五， // 查询索引的方法 ===============================5

public ArrayList getQueryDate(String info)

throws CorruptIndexException, IOException,

org.apache.lucene.queryParser.ParseException {

ArrayList doc = new ArrayList();

String queryString = info;

// Hits hits = null;

// Query query = null;

// QueryParser qp = null;

// String dir = "c:\\hujiong"; // 一定要跟你建索引的位置一致

// // 建立索引检索对象

// IndexSearcher searcher = new IndexSearcher(dir);

// // 分词器

// Analyzer analyzer = new StandardAnalyzer();

// qp = new QueryParser("content", analyzer);// 这里上面只写了一个按Content查找.

// 下面添加的是title, 查找

// query = qp.parse(queryString);

// if (searcher != null) {

// hits = searcher.search(query);

// doc = new ArrayList();

// for (int i = 0; i < hits.length(); i++) {

// doc.add(hits.doc(i));

// }

IndexSearcher searcher = new IndexSearcher("c:\\hujiong");

Analyzer analyzer = new StandardAnalyzer();

Query query = null;

if (searcher != null) {

// 合并你搜索的字段, 增强你的搜索能力!!

BooleanClause.Occur[] clauses = { BooleanClause.Occur.SHOULD,

BooleanClause.Occur.SHOULD };

query = MultiFieldQueryParser.parse(queryString, new String[] {

"title", "content" }, clauses, analyzer); // 这里就是在两个范围内进行收索 , 不过这些索引的字段必须要在添加数据到索引的时候设置它

TopDocCollector collector = new TopDocCollector(5); // 设置返回的最大数目，就返回前100条

searcher.search(query, collector);

ScoreDoc[] hits1 = collector.topDocs().scoreDocs;

// 返回的结果他是一个数组

if (hits1.length > 0) {

for (int i = 0; i < hits1.length; i++) {

Document doc1 = searcher.doc(hits1[i].doc);

// 这是从这个返回的数组里面迭代每一个数据，它的值是Document

doc.add(doc1);

System.out.println(doc1.get("title") + "-----title");

System.out.println(doc1.get("content") + "-------content");

}

} else {

System.out.println("没有数据");

}

return doc;

}

//上面注释掉的那段代码仅对单个field查询，下面的代码则支持多个field查询

//上面的例子只需要改变第一步，就可以查多个表的数据，你只需在getDate(String sql)中的sql语句改变为你要查询的表的sql语句，

//该方法中

// Fragment from inside getDate(String sql): for every row of rs it builds a
// BaseItem from the row's columns and adds it to item.
// NOTE(review): rs is presumably a java.sql.ResultSet and item a list of
// BaseItem; neither declaration is visible here, so confirm.
// The column names read below must match the columns of the table being queried.
while (rs.next()) {

BaseItem i = new BaseItem();

i.setTitle(rs.getString("title")); // matches the title column of your Blog table

i.setContent(rs.getString("content")); // the blog content from the table

// NOTE(review): setUr looks like a typo for setUrl; confirm against BaseItem.
i.setUr("SingleArticle_lucene.action?id=" + rs.getInt("blogId")); // e.g. a.action?id=8

item.add(i);

} // copies the data out of the database

也要和你查询的表的字段对应就可以。。。

package phz;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.queryParser.MultiFieldQueryParser;

import org.apache.lucene.search.BooleanClause;

import org.apache.lucene.search.Hits;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.MultiSearcher;

import org.apache.lucene.search.Query;

/**

* 这个实例包含了lucene所有核心用法

* @author panhuizi

public class LuceneTest {

public static void main(String[] args) {

try {

LuceneTest luceneTest = new LuceneTest();

// 创建索引

luceneTest.index();

// 在索引所在目录下搜索"中国金牌"

luceneTest.search("中国金牌");

} catch (Exception e) {

e.printStackTrace();

}

System.out.println("ok");

}

public void index() throws Exception {

/* 创建索引初始化，执行这些语句将创建或清空d:\\save\\目录下所有索引 */

IndexWriter writer1 = new IndexWriter("d:\\save\\",

new StandardAnalyzer(), true);

writer1.close();

* 往创建的初始化索引中添加索引内容，StandardAnalyzer表示用lucene自带的标准分词机制，

* false表示不覆盖原来该目录的索引，细心的读者可能已经发现，这句话和上面的那句就这个false不一样

IndexWriter writer2 = new IndexWriter("d:\\save\\",

new StandardAnalyzer(), false);

/* 创建一份文件 */

Document doc1 = new Document();

* 创建一个域ArticleTitle，并往这个域里面添加内容 "Field.Store.YES"表示域里面的内容将被存储到索引

* "Field.Index.TOKENIZED"表示域里面的内容将被索引，以便用来搜索

Field field1 = new Field("ArticleTitle", "北京2008年奥运会", Field.Store.YES,

Field.Index.TOKENIZED);

/* 往文件里添加这个域 */

doc1.add(field1);

/* 同理：创建另外一个域ArticleText，并往这个域里面添加内容 */

Field field2 = new Field("ArticleText", "这是一届创造奇迹、超越梦想的奥运会.......",

Field.Store.YES, Field.Index.TOKENIZED);

doc1.add(field2);

// 在这里还可以添加其他域

/* 添加这份文件到索引 */

writer2.addDocument(doc1);

/* 同理：创建第二份文件 */

Document doc2 = new Document();

field1 = new Field("ArticleTitle", "中国获得全球赞誉", Field.Store.YES,

Field.Index.TOKENIZED);

doc2.add(field1);

field2 = new Field("ArticleText", "中国所取得的金牌总数排行榜的榜首........",

Field.Store.YES, Field.Index.TOKENIZED);

doc2.add(field2);

writer2.addDocument(doc2);

// 在这里可以添加其他文件

/* 关闭 */

writer2.close();

}

public void search(String serchString) throws Exception {

/* 创建一个搜索，搜索刚才创建的d:\\save\\目录下的索引 */

IndexSearcher indexSearcher = new IndexSearcher("d:\\save\\");

/* 在这里我们只需要搜索一个目录 */

IndexSearcher indexSearchers[] = { indexSearcher };

/* 我们需要搜索两个域"ArticleTitle", "ArticleText"里面的内容 */

String[] fields = { "ArticleTitle", "ArticleText" };

/* 下面这个表示要同时搜索这两个域，而且只要一个域里面有满足我们搜索的内容就行 */

BooleanClause.Occur[] clauses = { BooleanClause.Occur.SHOULD,

BooleanClause.Occur.SHOULD };

* MultiFieldQueryParser表示多个域解析，

* 同时可以解析含空格的字符串，如果我们搜索"中国金牌"，根据前面的索引，显然搜到的是第二份文件

Query query = MultiFieldQueryParser.parse(serchString, fields, clauses,

new StandardAnalyzer());

/* Multisearcher表示多目录搜索，在这里我们只有一个目录 */

MultiSearcher searcher = new MultiSearcher(indexSearchers);

/* 开始搜索 */

Hits h = searcher.search(query);

/* 把搜索出来的所有文件打印出来 */

for (int i = 0; i < h.length(); i++) {

/* 打印出文件里面ArticleTitle域里面的内容 */

System.out.println(h.doc(i).get("ArticleTitle"));

/* 打印出文件里面ArticleText域里面的内容 */

System.out.println(h.doc(i).get("ArticleText"));

}

/* 关闭 */

searcher.close();

}

在Lucene里面没有update方法，我查了文档，我们只能删除以前的索引，然后增加新的索引。

具体步骤是：根据某个唯一的关键词（比如url），找到已经存在的索引项并删除它，然后再增加新的索引项。

下面是我的一个根据网页URL删除索引的方法，里面主要使用了Item里面保存的一个docId的int类型的参数

这个数字是lucene内部每个索引的顺序号，类似于rowid

@SuppressWarnings("unchecked")

public synchronized void deleteByUrl(String url) {

synchronized (indexPath) {

try {

IndexReader indexReader = IndexReader.open(indexPath);

Iterator it = searchUrl(url).iterator();

while (it.hasNext()) {

indexReader.deleteDocument(((LuceneItem) it.next()).getDocId());

}

indexReader.close();

} catch (IOException e) {

System.out.println(e);

}

/**

* Lucene 2.4 搜索一个关键字的方法(Lucene Hits deprecated的解决方法)

* @param url

* @return

public List searchUrl(String url) {

try {

// 替换一些特殊字符，比如冒号等

url = StrTools.encodeURLForLucene(url);

IndexSearcher isearcher = new IndexSearcher(indexPath);

QueryParser parser = new QueryParser(FIELD_URL, getAnalyzer());

Query query = parser.parse(url);

// 下面的这个方法已经不推荐使用了。

// Hits hits = isearcher.search(query);

// 改为如下的方式

TopDocCollector collector = new TopDocCollector(10);

isearcher.search(query, collector);

ScoreDoc[] hits = collector.topDocs().scoreDocs;

List rtn = new LinkedList();

LuceneItem o;

for (int i = 0; i < hits.length; i++) {

Document doc = isearcher.doc(hits[i].doc);

o = new LuceneItem();

o.setDocId(hits[i].doc);

o.setUrl(doc.get(FIELD_URL));

o.setAuthor(doc.get(FIELD_AUTHOR));

o.setTitle(doc.get(FIELD_TITLE));

o.setDatetimeCreate(doc.get(FIELD_DATETIMECREATE));

o.setBody(doc.get(FIELD_BODY));

rtn.add(o);

}

isearcher.close();

return rtn;

} catch (Exception e) {

e.printStackTrace();

return null;

}

然后在增加索引的地方，先调用deleteByUrl方法删除可能已经存在的数据，然后再增加数据

public synchronized void IndexSingle(Item item) {

synchronized (indexPath) {

try {

// 先删除以前的数据

deleteByUrl(item.getUrl());

// 增加数据

IndexWriter writer = getIndexWriter();

writer.setMaxFieldLength(10000000);

Date start = new Date();

Document doc = new Document();// 一个文档相当与表的一条记录

doc.add(new Field(FIELD_URL, item.getUrl(), Field.Store.YES, Field.Index.ANALYZED));

doc.add(new Field(FIELD_AUTHOR, item.getAuthor(), Field.Store.YES, Field.Index.ANALYZED));

doc.add(new Field(FIELD_TITLE, item.getTitle(), Field.Store.YES, Field.Index.ANALYZED));

doc.add(new Field(FIELD_DATETIMECREATE, item.getDatetimeCreate(), Field.Store.YES, Field.Index.ANALYZED));

doc.add(new Field(FIELD_BODY, item.getBody(), Field.Store.YES, Field.Index.ANALYZED));

writer.addDocument(doc);

// writer.optimize();// 优化

writer.close();// 一定要关闭，否则不能把内存中的数据写到文件

Date end = new Date();

System.out.println("索引建立成功！！！！" + "用时" + (end.getTime() - start.getTime()) + "毫秒");

} catch (IOException e) {

System.out.println(e);

}

weixin_39569112

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫