Lucene 字符编码问题

现在如果一个txt文件中同时包含了ANSI编码的文本和Unicode编码的文本,如下图所示:



 当用Lucene来建索引搜索时,这个文档中的内容是搜索不到的。

 

需要搜索的文本在附件中提供。

 

创建索引的源代码:

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Builds a Lucene index over the regular files found directly inside a
 * hard-coded document directory.
 *
 * <p>Each file becomes one document with three fields: {@code name} (file name
 * up to its first dot), {@code path} (file path) and {@code content} (the file
 * text, analyzed). The text is decoded according to a leading Unicode byte
 * order mark when one is present, so files saved as "Unicode" (UTF-16) or
 * UTF-8 with BOM are no longer read with the platform default charset.
 */
public class IndexFiles {
	/**
	 * Indexes the files in the document directory into the index directory.
	 * Sub-directories are skipped. An {@link IOException} is reported on
	 * standard output and ends the run.
	 *
	 * @param args ignored
	 */
	@SuppressWarnings("deprecation")
	public static void main(String[] args) {

		File indexDir = new File("e:\\Lucene\\index");
		File docDir = new File("e:\\Lucene\\content");

		// File.list() returns null when the path is not a listable directory;
		// check this before the index writer is opened.
		String[] files = docDir.list();
		if (files == null) {
			System.out.println(" cannot list directory: " + docDir.getPath());
			return;
		}

		try {
			// The index writer.
			IndexWriter standardWriter = new IndexWriter(FSDirectory
					.open(indexDir), new StandardAnalyzer(
					Version.LUCENE_CURRENT), true,
					IndexWriter.MaxFieldLength.LIMITED);
			try {
				// Do not use the compound index file format (compound is the
				// default).
				standardWriter.setUseCompoundFile(false);
				for (String fileStr : files) {
					File file = new File(docDir, fileStr);
					if (!file.isDirectory()) {
						indexFile(standardWriter, file);
					}
				}
				standardWriter.optimize();
			} finally {
				// Close the writer even when indexing fails part-way.
				standardWriter.close();
			}
		} catch (IOException e) {
			System.out.println(" caught a " + e.getClass()
					+ "\n with message: " + e.getMessage());
		}
	}

	/**
	 * Adds one file to the index as a document with {@code name}, {@code path}
	 * and {@code content} fields.
	 *
	 * @param writer the open index writer
	 * @param file   the regular file to index
	 * @throws IOException if the file cannot be read or the document cannot be
	 *                     added
	 */
	private static void indexFile(IndexWriter writer, File file)
			throws IOException {
		Document doc = new Document();
		// File name: stored, searchable, not tokenized.
		String fileName = baseName(file.getName());
		System.out.println("fileName:"+fileName);
		doc.add(new Field("name", fileName, Field.Store.YES,
				Field.Index.NOT_ANALYZED));
		// File path: stored, searchable, not tokenized.
		doc.add(new Field("path", file.getPath(), Field.Store.YES,
				Field.Index.NOT_ANALYZED));
		// File content: the text that is searched.
		Reader content = openReader(file);
		try {
			doc.add(new Field("content", content));
			writer.addDocument(doc);
		} finally {
			// Closing a reader that is already closed has no effect, so this is
			// safe whether or not the indexer closed it.
			content.close();
		}
	}

	/**
	 * Returns the part of a file name before its first dot, or the whole name
	 * when it contains no dot.
	 *
	 * @param name the file name
	 * @return the name without everything from the first dot onwards
	 */
	private static String baseName(String name) {
		int dot = name.indexOf('.');
		return dot < 0 ? name : name.substring(0, dot);
	}

	/**
	 * Opens a file for reading as text, choosing the charset from a leading
	 * byte order mark: EF BB BF selects UTF-8, FF FE selects UTF-16LE and
	 * FE FF selects UTF-16BE. The mark itself is consumed. Without a mark the
	 * platform default charset is used, as {@code FileReader} would.
	 *
	 * @param file the file to open
	 * @return a reader positioned at the first character after any mark
	 * @throws IOException if the file cannot be opened or read
	 */
	private static Reader openReader(File file) throws IOException {
		PushbackInputStream in = new PushbackInputStream(
				new BufferedInputStream(new FileInputStream(file)), 3);
		boolean opened = false;
		try {
			byte[] head = new byte[3];
			int n = readUpTo(in, head);
			String charset = null;
			int bomLength = 0;
			if (n >= 3 && (head[0] & 0xFF) == 0xEF
					&& (head[1] & 0xFF) == 0xBB
					&& (head[2] & 0xFF) == 0xBF) {
				charset = "UTF-8";
				bomLength = 3;
			} else if (n >= 2 && (head[0] & 0xFF) == 0xFF
					&& (head[1] & 0xFF) == 0xFE) {
				charset = "UTF-16LE";
				bomLength = 2;
			} else if (n >= 2 && (head[0] & 0xFF) == 0xFE
					&& (head[1] & 0xFF) == 0xFF) {
				charset = "UTF-16BE";
				bomLength = 2;
			}
			// Give back every sniffed byte that is not part of the mark.
			if (n > bomLength) {
				in.unread(head, bomLength, n - bomLength);
			}
			Reader reader = (charset == null)
					? new InputStreamReader(in)
					: new InputStreamReader(in, charset);
			opened = true;
			return reader;
		} finally {
			if (!opened) {
				in.close();
			}
		}
	}

	/**
	 * Reads bytes until the buffer is full or the stream ends.
	 *
	 * @param in  the stream to read from
	 * @param buf the buffer to fill
	 * @return the number of bytes actually read, possibly zero
	 * @throws IOException if reading fails
	 */
	private static int readUpTo(PushbackInputStream in, byte[] buf)
			throws IOException {
		int total = 0;
		while (total < buf.length) {
			int n = in.read(buf, total, buf.length - total);
			if (n < 0) {
				break;
			}
			total += n;
		}
		return total;
	}
}

 

搜索的源代码:

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Interactive command-line search over the {@code content} field of the index
 * in a hard-coded index directory.
 */
public class SearchFiles {

	/**
	 * Reads queries from standard input, one per line, and prints the matching
	 * documents for each. The loop ends on end of input or on a blank line.
	 *
	 * @param args ignored
	 * @throws Exception if the index cannot be opened or read, or a query
	 *                   cannot be parsed
	 */
	@SuppressWarnings("deprecation")
	public static void main(String[] args) throws Exception {

		String index = "E:\\Lucene\\index";
		String field = "content";
		boolean raw = false;
		// Number of hits to display.
		int hitsPerPage = 10;

		// Searching only, so the reader is opened read-only.
		IndexReader reader = IndexReader.open(
				FSDirectory.open(new File(index)), true);
		try {
			Searcher searcher = new IndexSearcher(reader);
			Analyzer standardAnalyzer = new StandardAnalyzer(
					Version.LUCENE_CURRENT);

			BufferedReader in = new BufferedReader(new InputStreamReader(
					System.in));
			QueryParser parser = new QueryParser(Version.LUCENE_CURRENT,
					field, standardAnalyzer);
			while (true) {
				// Prompt the user.
				System.out.println("Enter query: ");

				String line = in.readLine();
				// readLine() returns null at end of input.
				if (line == null) {
					break;
				}

				line = line.trim();
				if (line.length() == 0) {
					break;
				}

				Query query = parser.parse(line);
				System.out.println("Searching for: " + query.toString(field));

				doPagingSearch(in, searcher, query, hitsPerPage, raw, true);
			}
		} finally {
			// Release the index even when parsing or searching throws.
			reader.close();
		}
	}

	/**
	 * Runs the query and prints the total hit count followed by the stored
	 * {@code path} of each of the top hits, numbered from 1. Only the first
	 * {@code hitsPerPage} hits are shown; no further pages are fetched.
	 *
	 * @param in          not used by this implementation
	 * @param searcher    the searcher to run the query against
	 * @param query       the query to execute
	 * @param hitsPerPage the maximum number of hits to collect and print
	 * @param raw         not used by this implementation
	 * @param interactive not used by this implementation
	 * @throws IOException if the index cannot be read
	 */
	public static void doPagingSearch(BufferedReader in, Searcher searcher,
			Query query, int hitsPerPage, boolean raw, boolean interactive)
			throws IOException {

		TopScoreDocCollector collector = TopScoreDocCollector.create(
				hitsPerPage, false);
		searcher.search(query, collector);
		ScoreDoc[] hits = collector.topDocs().scoreDocs;

		System.out.println(collector.getTotalHits()
				+ " total matching documents");

		int end = Math.min(hits.length, hitsPerPage);
		for (int i = 0; i < end; i++) {
			Document doc = searcher.doc(hits[i].doc);
			String path = doc.get("path");
			if (path != null) {
				System.out.println((i + 1) + ". " + path);
			} else {
				System.out
						.println((i + 1) + ". " + "No path for this document");
			}
		}
	}
}

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值