Lucene + IKAnalyzer 中文分词及索引，简单实例

最新推荐文章于 2021-02-16 06:04:33 发布

wtsoftware

最新推荐文章于 2021-02-16 06:04:33 发布

阅读量439

点赞数

文章标签： import file exception string lucene date

原文:http://www.dev26.com/blog/article/351

import org.apache.lucene.document.Document;  
import org.apache.lucene.document.Field;  
import org.apache.lucene.document.DateTools;  
import org.apache.lucene.index.IndexWriter;  
import org.apache.lucene.queryParser.QueryParser;  
import org.apache.lucene.search.Hits;  
import org.apache.lucene.search.IndexSearcher;  
import org.apache.lucene.search.Query;  
import org.apache.lucene.store.Directory;  
import org.apache.lucene.store.FSDirectory;  
import org.mira.lucene.analysis.IK_CAnalyzer;  
     
import java.io.File;  
import java.io.FileNotFoundException;  
import java.io.IOException;  
import java.io.FileReader;  
import java.util.Date;  
     
/**
 * Minimal Lucene + IK analyzer example: builds an index over every readable file below the
 * classpath root and then runs one hard-coded query against the "contents" field.
 */
public class Searcher {
    // Index location: "/data/index" appended to the classpath root.
    // NOTE(review): URL.getPath() does not decode percent-escapes, so a classpath root that
    // contains spaces or non-ASCII characters may yield an unusable path - confirm.
    private static final String INDEX_DIR = Searcher.class.getResource("/").getPath()+"/data/index";
    // Root of the files to index: the classpath root itself (so it contains INDEX_DIR).
    private static final String DOC_DIR = Searcher.class.getResource("/").getPath();

    /**
     * Rebuilds the index from scratch, then searches it for a fixed query string.
     *
     * @param args unused
     * @throws Exception if the index directory does not exist after indexing, or if searching fails
     */
    public static void main(String[] args) throws Exception {
        String queryString = "测试";
        File indexDir = new File(INDEX_DIR);
        File docDir = new File(DOC_DIR);
        Date start = new Date();
        // The index must be built before it can be searched.
        try {
            IndexWriter writer = new IndexWriter(INDEX_DIR, new IK_CAnalyzer(), true);
            try {
                System.out.println("Indexing to directory '" + INDEX_DIR + "'...");
                indexDocs(writer, docDir);
                System.out.println("Optimizing...");
                writer.optimize();
            } finally {
                // Always release the writer (and its lock), even when indexing fails.
                writer.close();
            }
            Date end = new Date();
            System.out.println(end.getTime() - start.getTime() + " total milliseconds");
        } catch (IOException e) {
            System.out.println(" caught a " + e.getClass() +
                    "\n with message: " + e.getMessage());
        }
        if (!indexDir.exists() || !indexDir.isDirectory()) {
            throw new Exception(indexDir
                    + " does not exist or is not a directory.");
        }
        search(indexDir, queryString);
    }

    /**
     * Recursively adds {@code file} (or, for a directory, everything readable below it) to the index.
     * The index directory itself is skipped so that the index never ingests its own files.
     *
     * @param writer open index writer that receives the documents
     * @param file   file or directory to index
     * @throws IOException if a path cannot be resolved or a document cannot be added
     */
    private static void indexDocs(IndexWriter writer, File file)
            throws IOException {
        if (!file.canRead()) {
            return;
        }
        if (file.isDirectory()) {
            // INDEX_DIR lives below DOC_DIR; without this guard the segment and lock files
            // being written would be indexed as documents.
            if (file.getCanonicalFile().equals(new File(INDEX_DIR).getCanonicalFile())) {
                return;
            }
            String[] files = file.list();
            if (files != null) { // list() returns null on an I/O error
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
            return;
        }
        System.out.println("adding " + file);
        try {
            writer.addDocument(getDocument(file));
        } catch (FileNotFoundException fnfe) {
            // Best effort: a file that cannot be opened is reported and skipped, not fatal.
            System.out.println("skipping " + file + ": " + fnfe.getMessage());
        }
    }

    /**
     * Builds the Lucene document for one file: stored, untokenized "path" and "modified"
     * fields plus a "contents" field fed from a reader over the file.
     *
     * @param f file to describe
     * @return the document for {@code f}
     * @throws java.io.FileNotFoundException if {@code f} cannot be opened for reading
     */
    private static Document getDocument(File f)
            throws java.io.FileNotFoundException {
        Document doc = new Document();
        doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.UN_TOKENIZED));
        doc.add(new Field("modified",
                DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE),
                Field.Store.YES, Field.Index.UN_TOKENIZED));
        // NOTE(review): FileReader decodes with the platform default charset; confirm this
        // matches the encoding of the indexed files.
        doc.add(new Field("contents", new FileReader(f)));
        return doc;
    }

    /**
     * Runs query {@code q} against the "contents" field of the index in {@code indexDir}
     * and prints the stored path of every hit.
     *
     * @param indexDir directory holding the index
     * @param q        query string, parsed with {@code QueryParser} and {@code IK_CAnalyzer}
     * @throws Exception if the index cannot be opened or searched, or the query cannot be parsed
     */
    public static void search(File indexDir, String q) throws Exception {
        Directory fsDir = FSDirectory.getDirectory(indexDir);
        try {
            IndexSearcher is = new IndexSearcher(fsDir); // (1) open the index
            try {
                // (2) parse the query
                // NOTE(review): the notes accompanying this example recommend MIK_CAnalyzer for
                // query-time analysis; IK_CAnalyzer is kept here to preserve existing behavior.
                Query query = new QueryParser("contents", new IK_CAnalyzer()).parse(q);
                long start = new Date().getTime();
                Hits hits = is.search(query); // (3) search the index
                long end = new Date().getTime();
                System.err.println("Found " + hits.length() + " document(s) (in "
                        + (end - start) + " milliseconds) that matched query '" + q + "':");
                for (int i = 0; i < hits.length(); i++) {
                    Document doc = hits.doc(i); // (4) retrieve the matching document
                    System.out.println("file: " + doc.get("path"));
                }
            } finally {
                is.close();
            }
        } finally {
            fsDir.close();
        }
    }
}

正向全切分分词器：org.mira.lucene.analysis.IK_CAnalyzer（适合建索引时使用）

正向最大全切分分词器：org.mira.lucene.analysis.MIK_CAnalyzer（适合用户输入检索时使用）