如果一个txt文件中同时包含ANSI编码的文本和Unicode编码的文本(如下图所示),
那么用Lucene对它建立索引后,该文档中的内容是搜索不到的。
需要搜索的文本在附件中提供。
创建索引的源代码:
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
 * Builds a Lucene index over the regular files that sit directly inside the
 * content directory. Each file becomes one document with three fields:
 * "name" (file name up to the first dot), "path" and "content".
 */
public class IndexFiles {
    /** Length in bytes of the longest byte-order mark recognised (UTF-8: EF BB BF). */
    private static final int MAX_BOM_LENGTH = 3;

    /**
     * Indexes the documents under the docDir folder; the index files are
     * written to the indexDir folder. Any existing index there is replaced.
     *
     * @param args ignored
     */
    @SuppressWarnings("deprecation")
    public static void main(String[] args) {
        File indexDir = new File("e:\\Lucene\\index");
        File docDir = new File("e:\\Lucene\\content");
        try {
            // List the input first: File.list() returns null when the path is not a
            // readable directory, and we must not wipe the old index in that case.
            String[] files = docDir.list();
            if (files == null) {
                throw new IOException("Cannot list directory: " + docDir);
            }
            // The index writer (create=true: start a fresh index).
            IndexWriter standardWriter = new IndexWriter(FSDirectory.open(indexDir),
                    new StandardAnalyzer(Version.LUCENE_CURRENT), true,
                    IndexWriter.MaxFieldLength.LIMITED);
            try {
                // Do not build a compound index file (compound is the default).
                standardWriter.setUseCompoundFile(false);
                for (String fileStr : files) {
                    File file = new File(docDir, fileStr);
                    if (file.isDirectory()) {
                        continue;
                    }
                    addFile(standardWriter, file);
                }
                standardWriter.optimize();
            } finally {
                // Close the writer even when indexing fails.
                standardWriter.close();
            }
        } catch (IOException e) {
            System.out.println(" caught a " + e.getClass()
                    + "\n with message: " + e.getMessage());
        }
    }

    /** Adds one file to the index as a document with name, path and content fields. */
    private static void addFile(IndexWriter writer, File file) throws IOException {
        Document doc = new Document();
        // File name: searchable, not tokenized. Names without a dot are used
        // whole instead of failing in substring().
        String fullName = file.getName();
        int dot = fullName.indexOf(".");
        String fileName = dot >= 0 ? fullName.substring(0, dot) : fullName;
        System.out.println("fileName:"+fileName);
        doc.add(new Field("name", fileName, Field.Store.YES,
                Field.Index.NOT_ANALYZED));
        // File path: searchable, not tokenized.
        String filePath = file.getPath();
        doc.add(new Field("path", filePath, Field.Store.YES,
                Field.Index.NOT_ANALYZED));
        // File content: tokenized for full-text search, decoded per its BOM.
        Reader content = openReader(file);
        try {
            doc.add(new Field("content", content));
            writer.addDocument(doc);
        } finally {
            // Closing an already-closed Reader has no effect, so this is safe
            // whether or not the indexer closed it.
            content.close();
        }
    }

    /**
     * Opens a file as text, choosing the charset from a leading byte-order mark:
     * UTF-8 (EF BB BF), UTF-16LE (FF FE) or UTF-16BE (FE FF). The BOM itself is
     * consumed. Files without a BOM are decoded with the platform default
     * charset, exactly as {@code FileReader} would.
     *
     * NOTE(review): a file that mixes several encodings internally cannot be
     * decoded by any single charset; only whole-file, BOM-marked encodings are
     * handled here.
     *
     * @param file the file to read
     * @return a reader positioned on the first character after any BOM
     * @throws IOException if the file cannot be opened or read
     */
    private static Reader openReader(File file) throws IOException {
        PushbackInputStream in = new PushbackInputStream(
                new FileInputStream(file), MAX_BOM_LENGTH);
        boolean opened = false;
        try {
            byte[] head = new byte[MAX_BOM_LENGTH];
            int count = 0;
            // read() may return fewer bytes than requested, so loop until full or EOF.
            while (count < head.length) {
                int got = in.read(head, count, head.length - count);
                if (got < 0) {
                    break;
                }
                count += got;
            }
            String charset = null;
            int bomLength = 0;
            if (count >= 3 && (head[0] & 0xFF) == 0xEF
                    && (head[1] & 0xFF) == 0xBB && (head[2] & 0xFF) == 0xBF) {
                charset = "UTF-8";
                bomLength = 3;
            } else if (count >= 2 && (head[0] & 0xFF) == 0xFF
                    && (head[1] & 0xFF) == 0xFE) {
                charset = "UTF-16LE";
                bomLength = 2;
            } else if (count >= 2 && (head[0] & 0xFF) == 0xFE
                    && (head[1] & 0xFF) == 0xFF) {
                charset = "UTF-16BE";
                bomLength = 2;
            }
            // Give back whatever was read beyond the BOM.
            if (count > bomLength) {
                in.unread(head, bomLength, count - bomLength);
            }
            Reader reader = (charset == null)
                    ? new InputStreamReader(in)
                    : new InputStreamReader(in, charset);
            opened = true;
            return reader;
        } finally {
            if (!opened) {
                in.close();
            }
        }
    }
}
搜索的源代码:
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
 * Searches the index: reads queries from standard input and prints the paths
 * of the matching documents.
 */
public class SearchFiles {
    /**
     * Simple command-line based search demo. Prompts for a query, searches the
     * "content" field, prints the hits, and repeats until end of input or an
     * empty line.
     *
     * @param args ignored
     * @throws Exception if the index cannot be opened or read, or a query
     *         cannot be parsed
     */
    @SuppressWarnings("deprecation")
    public static void main(String[] args) throws Exception {
        String index = "E:\\Lucene\\index";
        String field = "content";
        boolean raw = false;
        // Number of hits to display.
        int hitsPerPage = 10;
        // searching, so read-only=true
        IndexReader reader = IndexReader.open(
                FSDirectory.open(new File(index)), true);
        try {
            Searcher searcher = new IndexSearcher(reader);
            Analyzer standardAnalyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
            BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
            QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, field,
                    standardAnalyzer);
            while (true) {
                // Queries always come from the console, so always prompt.
                System.out.println("Enter query: ");
                String line = in.readLine();
                if (line == null) {
                    break; // end of input
                }
                line = line.trim();
                if (line.length() == 0) {
                    break; // blank line ends the session
                }
                Query query = parser.parse(line);
                System.out.println("Searching for: " + query.toString(field));
                doPagingSearch(in, searcher, query, hitsPerPage, raw, true);
            }
        } finally {
            // Release the index even when parsing or searching fails.
            reader.close();
        }
    }

    /**
     * Runs the query and prints the total hit count followed by the stored
     * "path" of up to {@code hitsPerPage} top-scoring documents.
     *
     * @param in          not used by this implementation
     * @param searcher    searcher to run the query against
     * @param query       the query to execute
     * @param hitsPerPage maximum number of hits collected and printed
     * @param raw         not used by this implementation
     * @param interactive not used by this implementation
     * @throws IOException if the index cannot be read
     */
    public static void doPagingSearch(BufferedReader in, Searcher searcher,
            Query query, int hitsPerPage, boolean raw, boolean interactive)
            throws IOException {
        TopScoreDocCollector collector = TopScoreDocCollector.create(
                hitsPerPage, false);
        searcher.search(query, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;
        int numTotalHits = collector.getTotalHits();
        System.out.println(numTotalHits + " total matching documents");
        int start = 0;
        int end = Math.min(hits.length, start + hitsPerPage);
        for (int i = start; i < end; i++) {
            Document doc = searcher.doc(hits[i].doc);
            String path = doc.get("path");
            if (path != null) {
                System.out.println((i + 1) + ". " + path);
            } else {
                System.out
                        .println((i + 1) + ". " + "No path for this document");
            }
        }
    }
}