lucene入门到项目开发

需要加入的 jar 包:lucene-core-2.4.0.jar、je-analysis-1.4.0.jar、lucene-highlighter-2.4.1.jar、lucene-analyzers-2.4.1.jar

先准备下工具类

package com.cs.lucene.utils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStreamReader;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumberTools;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;

public class File2DocumentUtiles {

    /**
     * Converts a file on disk into a Lucene Document with four fields:
     * name (stored + analyzed), content (stored + analyzed),
     * size (stored, indexed as a single token) and path (stored only).
     *
     * @param filepath path of the file to index
     * @return a Document ready to be added to an IndexWriter
     */
    public static Document file2Document(String filepath) {
        File file = new File(filepath);

        Document doc = new Document();
        // File name: stored and analyzed so individual tokens are searchable.
        doc.add(new Field("name", file.getName(), Store.YES, Index.ANALYZED));
        // Full text content: stored and analyzed.
        doc.add(new Field("content", readFileContent(file), Store.YES, Index.ANALYZED));
        // File size: NumberTools pads the long so lexicographic order matches
        // numeric order; indexed as one token (NOT_ANALYZED) for range queries.
        doc.add(new Field("size", NumberTools.longToString(file.length()), Store.YES, Index.NOT_ANALYZED));
        // Path: stored for display only, never searched.
        doc.add(new Field("path", file.getPath(), Store.YES, Index.NO));

        return doc;
    }

    /**
     * Reads the whole file as text, line by line, joining lines with '\n'.
     * FIX: the reader is now closed in a finally block — the original
     * leaked the underlying FileInputStream.
     *
     * @param file file to read (platform default charset, as before)
     * @return the file content, or null if reading failed
     */
    private static String readFileContent(File file) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
            StringBuffer content = new StringBuffer();
            for (String line; (line = reader.readLine()) != null; ) {
                content.append(line).append("\n");
            }
            return content.toString();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (Exception ignored) {
                    // nothing sensible to do if close itself fails
                }
            }
        }
        return null;
    }

    /**
     * Prints the stored fields of a Document to stdout for debugging.
     */
    public static void printDocInfo(Document doc) {
        System.out.println("--------------------------");
        System.out.println("name =" + doc.get("name"));
        System.out.println("content =" + doc.get("content"));
        System.out.println("size =" + NumberTools.stringToLong(doc.get("size")));
        System.out.println("path =" + doc.get("path"));
    }

}


先了解下分词器
package com.cs.lucene.analyzer;

import java.io.StringReader;

import jeasy.analysis.MMAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.junit.Test;

public class AnalyzerTest {
    // Sample text mixing Chinese and ASCII to show how the analyzer tokenizes.
    String text = "资源来自互联网吴朝辉wwwa的a-b放到";
    Analyzer analyzer = new MMAnalyzer();

    @Test
    public void testAnalyze() throws Exception {
        analyze(analyzer, text);
    }

    /**
     * Tokenizes the given text with the given analyzer and prints each token.
     * BUG FIX: the original method ignored both of its parameters and always
     * used the instance fields instead; it now honours the arguments.
     *
     * @param analyzer analyzer to tokenize with
     * @param text     text to tokenize
     */
    private void analyze(Analyzer analyzer, String text) throws Exception {
        System.out.println("----------分词器-------------------");
        TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text));
        for (Token token = new Token(); (token = tokenStream.next(token)) != null; ) {
            System.out.println(token);
        }
    }

}


现在看看FSDirectory和RAMDirectory
package com.cs.lucene.directory;

import jeasy.analysis.MMAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.junit.Test;

import com.cs.lucene.utils.File2DocumentUtiles;

public class DirectoryTest {
    // File to index.
    String filePath = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceDataSource\\资源来自互联网,版权归原创作者或原单位公司所有.txt";
    // Directory on disk where the index is stored.
    String indexPath = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceIndex";
    // JE analyzer — handles Chinese word segmentation well.
    Analyzer analyzer = new MMAnalyzer();

    /**
     * Builds an index with FSDirectory (index stored on the file system).
     * FIX: the writer is closed in a finally block so a failed addDocument
     * no longer leaks the writer (and the index write-lock).
     *
     * @throws Exception on any indexing error
     */
    @Test
    public void testFSDirectory() throws Exception {
        Directory dir = FSDirectory.getDirectory(indexPath);
        Document doc = File2DocumentUtiles.file2Document(filePath);
        // No 'create' flag: documents are appended to any existing index.
        // MaxFieldLength.LIMITED indexes only the first 10000 terms per field.
        IndexWriter indexWriter = new IndexWriter(dir, analyzer, MaxFieldLength.LIMITED);
        try {
            indexWriter.addDocument(doc);
        } finally {
            indexWriter.close();
        }
    }

    /**
     * Builds an index with RAMDirectory (index kept in memory).
     * Pro: fast reads. Con: the index is gone when the JVM exits.
     *
     * @throws Exception on any indexing error
     */
    @Test
    public void testRAMDirectory() throws Exception {
        Directory dir = new RAMDirectory();
        Document doc = File2DocumentUtiles.file2Document(filePath);
        IndexWriter indexWriter = new IndexWriter(dir, analyzer, MaxFieldLength.LIMITED);
        try {
            indexWriter.addDocument(doc);
        } finally {
            indexWriter.close();
        }
    }

    /**
     * Typical production setup: load the on-disk index into RAM at startup,
     * work against the fast RAMDirectory while running, then write everything
     * back to the FSDirectory on shutdown.
     *
     * @throws Exception on any indexing error
     */
    @Test
    public void testRAMDirectoryAndFSDirectory() throws Exception {
        Directory fsDir = FSDirectory.getDirectory(indexPath);
        // 1. Startup: copy the on-disk index into memory.
        Directory ramDir = new RAMDirectory(fsDir);
        IndexWriter ramIndexWriter = new IndexWriter(ramDir, analyzer, MaxFieldLength.LIMITED);
        try {
            Document doc = File2DocumentUtiles.file2Document(filePath);
            ramIndexWriter.addDocument(doc);
        } finally {
            // Must close before merging — pending docs live in the writer's buffer.
            ramIndexWriter.close();
        }

        // 2. Shutdown: rewrite the on-disk index from the in-memory one.
        // 'true' recreates the index, discarding whatever was on disk before.
        IndexWriter fsIndexWriter = new IndexWriter(fsDir, analyzer, true, MaxFieldLength.LIMITED);
        try {
            // addIndexesNoOptimize merges without optimizing (faster merge,
            // larger index until optimize() runs).
            fsIndexWriter.addIndexesNoOptimize(new Directory[] { ramDir });
            fsIndexWriter.flush();    // flush buffered docs before optimizing
            fsIndexWriter.optimize(); // optimize before close to speed up later searches
        } finally {
            fsIndexWriter.close();
        }
    }

    /**
     * Optimizes an existing on-disk index in place.
     *
     * @throws Exception on any indexing error
     */
    @Test
    public void testOptimize() throws Exception {
        Directory fsDir = FSDirectory.getDirectory(indexPath);
        IndexWriter fsIndexWriter = new IndexWriter(fsDir, analyzer, MaxFieldLength.LIMITED);
        try {
            fsIndexWriter.optimize();
        } finally {
            fsIndexWriter.close();
        }
    }
}




现在来测测索引如何建立以及搜索
package com.cs.lucene.lucene;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import jeasy.analysis.MMAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

import com.cs.lucene.utils.File2DocumentUtiles;

public class IndexDao {
    // Directory on disk where the index lives.
    private String indexPath;
    // Analyzer used for both indexing and query parsing; the two must match.
    private Analyzer analyzer = null;

    public IndexDao() {
        this.indexPath = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceIndex";
        this.analyzer = new MMAnalyzer(); // JE analyzer, good Chinese segmentation
    }

    public IndexDao(Analyzer analyzer, String indexPath) {
        this.analyzer = analyzer;
        this.indexPath = indexPath;
    }

    /**
     * Parses a raw query string over the "name" and "content" fields
     * and runs the search.
     *
     * @param queryString raw query text typed by the user
     * @param firstResult index of the first hit to return (paging offset)
     * @param maxResults  maximum number of hits to return
     * @return total hit count plus the requested page of Documents
     * @throws Exception on parse or search failure
     */
    public QueryResult search(String queryString, int firstResult,
            int maxResults) throws Exception {
        // 1. Parse the query string into a Query over both fields.
        String[] fields = { "name", "content" };
        // Boost the title: a match in the name should score higher than
        // the same match in the body.
        Map<String, Float> boosts = new HashMap<String, Float>();
        boosts.put("name", 3.0f);
        boosts.put("content", 1.0f); // default boost

        QueryParser queryParser = new MultiFieldQueryParser(fields, analyzer,
                boosts); // multi-field search
        Query query = queryParser.parse(queryString);

        return search(query, firstResult, maxResults);
    }

    /**
     * Runs a Query, sorts hits by file size descending, highlights matched
     * terms in the content field and returns one page of results.
     * FIX: the IndexSearcher is now closed in a finally block — the
     * original leaked it on every call.
     *
     * @param query       the query to execute
     * @param firstResult paging offset
     * @param maxResults  page size
     * @return total hit count plus the requested page of Documents
     * @throws Exception on search or highlighting failure
     */
    public QueryResult search(Query query, int firstResult, int maxResults)
            throws Exception {
        IndexSearcher indexSearcher = null;
        try {
            // 2. Execute the query.
            indexSearcher = new IndexSearcher(indexPath);
            Filter filter = null; // optional post-filter; re-filters results, slow
            // Example range filter (disabled):
            // filter = new RangeFilter("size", NumberTools.longToString(200),
            //         NumberTools.longToString(500), true, true);
            Sort sort = new Sort();
            // SortField second argument true = descending (default is ascending).
            sort.setSort(new SortField[] { new SortField("size", true) });
            TopDocs topDocs = indexSearcher.search(query, filter, 10000, sort);

            int recordCount = topDocs.totalHits;

            // Highlighter: wraps matched terms in a red <font> tag.
            Formatter formatter = new SimpleHTMLFormatter("<font color='red'>",
                    "</font>");
            Scorer scorer = new QueryScorer(query);
            Highlighter highlighter = new Highlighter(formatter, scorer);

            // Fragment size: show up to 500 characters around the best match
            // (the "best fragment" containing the query terms).
            Fragmenter fragmenter = new SimpleFragmenter(500);
            highlighter.setTextFragmenter(fragmenter);

            // 3. Collect the requested page of hits.
            List<Document> recordList = new ArrayList<Document>();
            int end = Math.min(firstResult + maxResults, recordCount);
            for (int i = firstResult; i < end; i++) {
                ScoreDoc scoreDoc = topDocs.scoreDocs[i];

                int docSn = scoreDoc.doc; // internal document number
                Document doc = indexSearcher.doc(docSn);

                // Highlight the content; returns null when the query terms
                // do not occur in this field.
                String hc = highlighter.getBestFragment(analyzer, "content",
                        doc.get("content"));

                // No match in content: fall back to its first 200 characters.
                if (hc == null) {
                    String content = doc.get("content");
                    int endIndex = Math.min(200, content.length());
                    hc = content.substring(0, endIndex);
                }
                doc.getField("content").setValue(hc);

                recordList.add(doc);
            }
            // 4. Return the page plus the total hit count.
            return new QueryResult(recordCount, recordList);
        } finally {
            if (indexSearcher != null) {
                indexSearcher.close();
            }
        }
    }

    /**
     * Indexes a single file and commits it to the on-disk index.
     * FIX: the writer is closed in a finally block so a failed add/commit
     * no longer leaks the writer and the index write-lock.
     *
     * @param filePath path of the file to index
     * @throws Exception on any indexing error
     */
    public void save(String filePath) throws Exception {
        Document doc = File2DocumentUtiles.file2Document(filePath);
        // doc.setBoost(1.0f); // per-document boost; 1.0 is the default

        // false = append to the existing index instead of recreating it;
        // MaxFieldLength.LIMITED indexes only the first 10000 terms per field.
        IndexWriter indexWriter = new IndexWriter(indexPath, analyzer, false,
                MaxFieldLength.LIMITED);
        try {
            indexWriter.addDocument(doc);
            indexWriter.commit();
            indexWriter.optimize();
        } finally {
            indexWriter.close();
        }
    }

    public void save(File file) throws Exception {
        save(file.getAbsolutePath());
    }

    /**
     * Recursively indexes a file, or every file under a directory.
     * FIX: guards against File.listFiles() returning null (I/O error or
     * unreadable directory) instead of throwing a NullPointerException.
     *
     * @param file file or directory to index
     * @throws Exception on any indexing error
     */
    public void saveDirectory(File file) throws Exception {
        if (file.isFile()) { // plain file: index it directly
            save(file.getAbsolutePath());
            return;
        }
        File[] childs = file.listFiles();
        if (childs == null) {
            return; // unreadable directory — nothing to index
        }
        for (int i = 0; i < childs.length; i++) {
            File f = childs[i];
            if (f.isDirectory()) { // recurse into subdirectories
                saveDirectory(f);
            } else {
                save(f.getAbsolutePath());
            }
        }
    }

    /**
     * Debug helper: prints the directory tree, indenting "--" per level.
     *
     * @param file    file or directory to print
     * @param pointer current nesting depth
     * @throws Exception kept for signature compatibility
     */
    public void save(File file, int pointer) throws Exception {
        StringBuffer str = new StringBuffer();
        for (int i = 0; i < pointer; i++) {
            str.append("--");
        }
        if (file.isFile()) { // plain file: print and stop
            System.out.println(str + file.getName());
            return;
        }
        File[] childs = file.listFiles();
        if (childs == null) {
            return; // unreadable directory
        }
        for (int i = 0; i < childs.length; i++) {
            File f = childs[i];
            if (f.isDirectory()) { // directory: print then recurse
                System.out.println(str + f.getName());
                save(f, pointer + 1);
            } else {
                System.out.println(str + f.getName());
            }

        }
    }
}


package com.cs.lucene.lucene;

import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.document.Document;

/**
 * Value holder for one page of search results: the total number of hits
 * for the query plus the Documents actually fetched for this page.
 */
public class QueryResult {

    /** Total number of matching documents in the index. */
    private int recordCount = 0;

    /** The page of Documents returned for this request. */
    private List<Document> recordResults = new ArrayList<Document>();

    /**
     * @param recordCount   total hit count
     * @param recordResults the fetched page of Documents
     */
    public QueryResult(int recordCount, List<Document> recordResults) {
        this.recordCount = recordCount;
        this.recordResults = recordResults;
    }

    public int getRecordCount() {
        return recordCount;
    }

    public List<Document> getRecordResults() {
        return recordResults;
    }

    public void setRecordCount(int recordCount) {
        this.recordCount = recordCount;
    }

    public void setRecordResults(List<Document> recordResults) {
        this.recordResults = recordResults;
    }

}



测试索引
package com.cs.lucene.lucene;

import java.io.File;

import jeasy.analysis.MMAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.junit.Test;

import com.cs.lucene.utils.File2DocumentUtiles;

public class IndexDaoTest {

    private IndexDao indexDao = new IndexDao();

    /**
     * Searches the index library and prints every returned hit.
     */
    @Test
    public void testSearch() throws Exception {
        String queryString = "www*";
        QueryResult queryResults = indexDao.search(queryString, 0, 10);
        // Dump the result page.
        System.out.println("总共有【" + queryResults.getRecordCount() + "】条匹配结果");

        for (Document doc : queryResults.getRecordResults()) {
            File2DocumentUtiles.printDocInfo(doc);
        }
    }

    /**
     * Indexes a single source file into the index library.
     */
    @Test
    public void testSave() throws Exception {
        // Source file to index.
        String filePath2 = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceDataSource\\readme2.txt";

        indexDao.save(filePath2);
    }

    /**
     * Recursively indexes a whole directory into the index library.
     */
    @Test
    public void testSaveDir() throws Exception {
        String filepath = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceDataSource\\www";
        File file = new File(filepath);
        indexDao.saveDirectory(file);
    }
}


最后我们来看看lucene的查询功能
package com.cs.lucene.query;

import java.util.Date;

import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumberTools;
import org.apache.lucene.document.DateTools.Resolution;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RangeQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.junit.Test;

import com.cs.lucene.lucene.IndexDao;
import com.cs.lucene.lucene.QueryResult;
import com.cs.lucene.utils.File2DocumentUtiles;

public class QueryTest {

    IndexDao indexDao = new IndexDao();

    /**
     * Exact-term query: matches documents whose "name" field contains
     * the given term as produced by the analyzer at index time.
     */
    @Test
    public void testTermQuery() throws Exception {
        Term term = new Term("name", "资源");
        Query query = new TermQuery(term);

        queryAndPrintResult(query);
    }

    /**
     * Range query over the "size" field. Every value in a query is a string,
     * so NumberTools pads the numbers to keep lexicographic order numeric.
     */
    @Test
    public void testRangeQuery() throws Exception {
        Term lowerTerm = new Term("size", NumberTools.longToString(200));
        Term upperTerm = new Term("size", NumberTools.longToString(500));
        // Third argument: whether the bounds themselves are included.
        Query query = new RangeQuery(lowerTerm, upperTerm, true);

        queryAndPrintResult(query);
    }

    /**
     * Demonstrates the NumberTools and DateTools encodings.
     */
    @Test
    public void testNumberToolsAndDateTools() throws Exception {
        System.out.println("数字测试:");
        System.out.println(NumberTools.longToString(200));
        System.out.println(NumberTools.longToString(500));
        System.out.println(NumberTools.stringToLong("000000000000dw"));

        System.out.println("日期测试:");
        System.out.println(DateTools.dateToString(new Date(), Resolution.SECOND));
        System.out.println(DateTools.dateToString(new Date(), Resolution.DAY));
        System.out.println(DateTools.stringToDate("20101005080855"));
    }

    /**
     * Wildcard query: '?' matches exactly one character,
     * '*' matches zero or more characters.
     */
    @Test
    public void testWildcardQuery() throws Exception {
        Term term = new Term("name", "*me");
        Query query = new WildcardQuery(term);

        queryAndPrintResult(query);
    }

    /**
     * Phrase query: matches documents containing all added terms
     * within the configured slop distance.
     */
    @Test
    public void testPhraseQuery() throws Exception {
        PhraseQuery phraseQuery = new PhraseQuery();
        phraseQuery.add(new Term("name", "资源"));
        phraseQuery.add(new Term("name", "作者"));

        // Slop: maximum number of positions allowed between the two terms.
        phraseQuery.setSlop(20);

        queryAndPrintResult(phraseQuery);
    }

    /**
     * Boolean query — combining clauses. The important cases:
     * 1. MUST + MUST: intersection of both sub-queries.
     * 2. MUST + MUST_NOT: results matching MUST but excluding MUST_NOT.
     * 3. SHOULD + SHOULD: logical OR — the union of the sub-queries.
     * Note: some combinations are meaningless.
     *
     * @throws Exception on search failure
     */
    @Test
    public void testBooleanQuery() throws Exception {
        // Clause 1: phrase query on the name field.
        PhraseQuery phraseQuery = new PhraseQuery();
        phraseQuery.add(new Term("name", "资源"));
        phraseQuery.add(new Term("name", "作者"));
        phraseQuery.setSlop(20);

        // Clause 2: range query on the size field (bounds included).
        Term lowerTerm2 = new Term("size", "200");
        Term upperTerm2 = new Term("size", "500");
        Query rangeQuery = new RangeQuery(lowerTerm2, upperTerm2, true);

        // Combine both clauses with AND semantics.
        BooleanQuery booleanQuery = new BooleanQuery();
        booleanQuery.add(phraseQuery, Occur.MUST);
        booleanQuery.add(rangeQuery, Occur.MUST);

        queryAndPrintResult(booleanQuery);
    }

    /**
     * Runs the query through IndexDao and dumps every hit to stdout.
     */
    private void queryAndPrintResult(Query query) throws Exception {
        System.out.println("相对应的查询字符串:" + query);
        QueryResult qr = indexDao.search(query, 0, 100);
        System.out.println("总共有【" + qr.getRecordCount() + "】条匹配结果");

        for (Document doc : qr.getRecordResults()) {
            File2DocumentUtiles.printDocInfo(doc);
        }
    }
}



通过以上学习,您应该对 Lucene 的基本开发没什么问题了。恭喜您,您又向前迈进了一步!
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值