org.apache.lucene.search.spell
Class SpellChecker
java.lang.Object
org.apache.lucene.search.spell.SpellChecker
Lucene拼写检查类
使用例子:
SpellChecker spellchecker = new SpellChecker(spellIndexDirectory);
// To index a field of a user index:
spellchecker.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field));
// To index a file containing words:
spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt")));
String[] suggestions = spellchecker.suggestSimilar("misspelt", 5);
SpellChecker有三个构造方法,可以根据给定的Directory实例创建SpellChecker对象进行后续操作;
PlainTextDictionary实现了Dictionary接口,并提供3个构造方法,参数分别为:File、InputStream、Reader
上面例子中根据一个文本文件创建PlainTextDictionary字典,该文本文件的格式为每一行包含一个词,如:
word1
word2
word3
其他:FileDictionary, HighFrequencyDictionary, LuceneDictionary
SpellChecker方法:
String [] suggestSimilar(String word,int numSug)
参数:
word-需要检查的词
numSug-返回的suggest词数
其他重载方法:String[] suggestSimilar(...),可以指定精度(accuracy)等参数来筛选返回的建议词,详情请参考官方文档;
完整代码示例:
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spell.PlainTextDictionary;
import org.apache.lucene.search.spell.SpellChecker;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Small demo of Lucene's {@code SpellChecker}: builds an in-memory content index,
 * builds a separate in-memory spell index from a plain-text dictionary file, and
 * prints "did you mean" suggestions for a misspelt word.
 *
 * <p>Not thread-safe: the content directory, reader and searcher are kept in
 * unsynchronized instance fields.
 */
public class SpellCheckerTest {

    /** Plain-text dictionary (one word per line) used to build the spell index. */
    private static final String DICTIONARY_FILE_PATH =
            "C:\\Users\\Mr_Tank_\\Desktop\\BaseTest\\dictionaryfile.txt";

    /** Content index storage; created lazily once so that every indexed document is kept. */
    private Directory directory;

    /** Reader backing {@link #indexSearcher}; replaced (and the old one closed) on each search. */
    private IndexReader indexReader;

    /** Searcher used by {@link #gethits(String)} and {@link #getDocumentList(ScoreDoc[])}. */
    private IndexSearcher indexSearcher;

    /** Builds a fresh writer configuration; a config instance is handed to one writer only. */
    private IndexWriterConfig getConfig() {
        return new IndexWriterConfig(Version.LUCENE_43, new IKAnalyzer(true));
    }

    /**
     * Returns the shared content directory, creating it on first use.
     * Re-creating it per call would silently drop previously indexed documents.
     */
    private Directory getDirectory() {
        if (directory == null) {
            directory = new RAMDirectory();
        }
        return directory;
    }

    /**
     * Opens a writer on the shared content directory.
     *
     * @throws IOException if the writer cannot be opened
     */
    private IndexWriter getIndexWriter() throws IOException {
        return new IndexWriter(getDirectory(), getConfig());
    }

    /** Closes a resource if present, reporting (not propagating) a failure to close. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Adds one document with a stored {@code content} field to the content index.
     * An I/O failure is reported on stderr and the document is not indexed.
     *
     * @param content text to index
     */
    public void createIndex(String content) {
        IndexWriter writer = null;
        try {
            writer = getIndexWriter();
            Document doc = new Document();
            doc.add(new TextField("content", content, Field.Store.YES));
            writer.addDocument(doc);
            writer.commit();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close even when indexing failed so the writer is not left open.
            closeQuietly(writer);
        }
    }

    /**
     * Searches the {@code content} field of the content index.
     *
     * @param content query string, parsed with the classic query parser
     * @return up to 1000 hits, or {@code null} if the index cannot be read or the query is invalid
     */
    public ScoreDoc[] gethits(String content) {
        try {
            // Release the reader from any previous search before opening a new one.
            closeQuietly(indexReader);
            indexReader = null;
            indexSearcher = null;

            indexReader = DirectoryReader.open(getDirectory());
            indexSearcher = new IndexSearcher(indexReader);
            QueryParser parser = new QueryParser(Version.LUCENE_43, "content", new IKAnalyzer(true));
            Query query = parser.parse(content);
            TopDocs topDocs = indexSearcher.search(query, 1000);
            return topDocs.scoreDocs;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Loads the stored documents for the given hits using the searcher of the last
     * {@link #gethits(String)} call.
     *
     * @param scoreDocs hits returned by {@link #gethits(String)}; may be {@code null}
     * @return the matching documents; an empty list (never {@code null}) when there are
     *         no hits or no search has been run yet
     * @throws IOException if a document cannot be read from the index
     */
    public List<Document> getDocumentList(ScoreDoc[] scoreDocs) throws IOException {
        List<Document> documentList = new ArrayList<Document>();
        if (scoreDocs == null || indexSearcher == null) {
            return documentList;
        }
        for (ScoreDoc hit : scoreDocs) {
            documentList.add(indexSearcher.doc(hit.doc));
        }
        return documentList;
    }

    /**
     * Builds a spell index from the dictionary file and returns suggestions for a word.
     * The spell index lives in its own directory so the content index is left untouched.
     *
     * @param word   the word to check
     * @param numSug maximum number of suggestions to return
     * @return the suggested words, or {@code null} if the dictionary or index could not be read
     */
    public String[] search(String word, int numSug) {
        Directory spellIndexDirectory = new RAMDirectory();
        SpellChecker checker = null;
        try {
            checker = new SpellChecker(spellIndexDirectory);
            checker.indexDictionary(
                    new PlainTextDictionary(new File(DICTIONARY_FILE_PATH)), getConfig(), true);
            return getSuggestions(checker, word, numSug);
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            // Close the checker before the directory it reads from.
            closeQuietly(checker);
            closeQuietly(spellIndexDirectory);
        }
    }

    /** Delegates to {@code SpellChecker.suggestSimilar(word, numSug)}. */
    private String[] getSuggestions(SpellChecker spellchecker, String word, int numSug) throws IOException {
        return spellchecker.suggestSimilar(word, numSug);
    }

    public static void main(String[] args) throws IOException {
        SpellCheckerTest spellCheckerTest = new SpellCheckerTest();
        spellCheckerTest.createIndex("开源中国-找到您想要的开源项目,分享和交流");
        spellCheckerTest.createIndex("CSDN-全球最大中文IT社区");
        // Deliberately misspelt query word.
        String word = "开园中国";
        /*
         * Optional: run a normal search against the content index first.
         *
         * ScoreDoc[] scoreDocs = spellCheckerTest.gethits(word);
         * List<Document> documentList = spellCheckerTest.getDocumentList(scoreDocs);
         * for (Document d : documentList) {
         *     System.out.println("搜索结果:" + d.get("content"));
         * }
         */
        String[] suggest = spellCheckerTest.search(word, 5);
        if (suggest != null && suggest.length >= 1) {
            for (String s : suggest) {
                System.out.println("您是不是要找:" + s);
            }
        } else {
            System.out.println("拼写正确");
        }
    }
}
dictionaryfile.txt:
中华人民共和国
开源中国
开源社区
Lucene
拼写检查
Lucene4.3.1