public class TestNorms { public void createIndex() throws IOException { Directory d = new SimpleFSDirectory(new File("d:/falconTest/lucene3/norms")); IndexWriter writer = new IndexWriter(d, new StandardAnalyzer(Version.LUCENE_30), true, IndexWriter.MaxFieldLength.UNLIMITED); Field field = new Field("desc", "", Field.Store.YES, Field.Index.ANALYZED); Document doc = new Document(); field.setValue("Hello students was drive"); doc.add(field); writer.addDocument(doc); writer.optimize(); writer.close(); } public void search() throws IOException { Directory d = new SimpleFSDirectory(new File("d:/falconTest/lucene3/norms")); IndexReader reader = IndexReader.open(d); IndexSearcher searcher = new IndexSearcher(reader); TopDocs docs = searcher.search(new TermQuery(new Term("desc","drove")), 10); System.out.println(docs.totalHits); } public static void main(String[] args) throws IOException { TestNorms test= new TestNorms(); test.createIndex(); test.search(); } }
/**
 * Analyzer whose token stream is a {@link LowerCaseTokenizer} over the input,
 * wrapped in a {@link PorterStemFilter}.
 */
public class PorterStemAnalyzer extends Analyzer {

    /**
     * Builds the analysis chain for one field.
     *
     * @param fieldName name of the field being analyzed (not used by this analyzer)
     * @param reader    source of the text to tokenize
     * @return the tokenizer's output passed through the Porter stem filter
     */
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        // Stage 1: tokenizer that reads directly from the supplied reader.
        TokenStream lowerCased = new LowerCaseTokenizer(reader);
        // Stage 2: stemming filter applied on top of the tokenizer's output.
        TokenStream stemmed = new PorterStemFilter(lowerCased);
        return stemmed;
    }
}
把此分词器用在你的程序中，就能够识别单复数和规则的词形变化了。
public void createIndex() throws IOException { Directory d = new SimpleFSDirectory(new File("d:/falconTest/lucene3/norms")); IndexWriter writer = new IndexWriter(d, new PorterStemAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED);
Field field = new Field("desc", "", Field.Store.YES, Field.Index.ANALYZED); Document doc = new Document(); field.setValue("Hello students was driving cars professionally"); doc.add(field);
drove processing drove 3 PARADIGM 0: normal form 'DROVE' part of speech:0 PARADIGM 1: normal form 'DROVE' part of speech:2 PARADIGM 2: normal form 'DRIVE' part of speech:2
was processing was 3 PARADIGM 0: normal form 'BE' part of speech:3 PARADIGM 1: normal form 'BE' part of speech:3 PARADIGM 2: normal form 'BE' part of speech:3