前提:这里之所以用第一个版本,是因为该组件的版本问题造成的冲突
1.jar包
IKAnalyzer3.2.3Stable.jar
lucene-core-3.6.0.jar
package com.xx.utils;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
import org.wltea.analyzer.lucene.IKQueryParser;
import org.wltea.analyzer.lucene.IKSimilarity;
import com.ishehui.entity.Article;
import com.ishehui.entity.Conchs;
/*****
* IKAnalyzer 中文分词 && 相似度匹配
* @author wenmeishuai
*
*/
public class IkTest {

    /*
     * Analyzer shared by indexing and querying. The SAME analyzer must be used
     * for both phases, otherwise queries will not match the indexed terms.
     * true = IK "max word length" (coarse-grained) segmentation mode.
     */
    private static Analyzer analyzer = new IKAnalyzer(true);

    // Directory holding the on-disk index.
    // (production path was /data/work/videos/conchsindex/, dev uses d:\indexDir\)
    private static File indexFile = new File("d:\\indexDir\\");

    // Fields targeted by the multi-field searches below.
    private static String[] fieldName = {"id", "content"};

    /**
     * Tokenizes {@code text} with IKAnalyzer, printing each token and returning
     * them concatenated, each wrapped in 【】.
     *
     * @param text the phrase to segment
     * @return the wrapped tokens joined together, or "" on I/O failure
     */
    public static String splitWord(String text) {
        StringReader reader = new StringReader(text);
        TokenStream ts = null;
        try {
            ts = analyzer.tokenStream("", reader);
            // FIX: honor the TokenStream contract - reset() before consuming.
            ts.reset();
            CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
            System.out.print("IKAnalyzer把关键字拆分的结果是:");
            // FIX: StringBuilder instead of StringBuffer - no synchronization needed.
            StringBuilder b = new StringBuilder();
            while (ts.incrementToken()) {
                System.out.print("【" + term.toString() + "】");
                b.append("【").append(term.toString()).append("】");
            }
            ts.end();
            return b.toString();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // FIX: the token stream was previously never closed.
            if (ts != null) {
                try {
                    ts.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            reader.close();
        }
        return "";
    }

    /**
     * Builds (or appends to) the on-disk index from a list of Conchs entities.
     * Each entity contributes an "id" field (stored, not analyzed) and a
     * "content" field (stored, analyzed).
     *
     * @param cs          entities to index
     * @param isCreateAll true: wipe the existing index first; false: append
     */
    public static void createConchsIndexFile(List<Conchs> cs, boolean isCreateAll) {
        long startTime = System.currentTimeMillis();
        System.out.println("*****************创建索引开始**********************");
        Directory directory = null;
        IndexWriter indexWriter = null;
        try {
            // Lucene is backward compatible; pick the config matching our 3.6 jar.
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_36, analyzer);
            directory = new SimpleFSDirectory(indexFile);
            indexWriter = new IndexWriter(directory, indexWriterConfig);
            if (isCreateAll) {
                // Avoid duplicate documents: drop everything indexed so far.
                indexWriter.deleteAll();
            }
            for (int i = 0; i < cs.size(); i++) {
                Conchs article = cs.get(i);
                Document doc = new Document();
                // NOTE(review): getId()/getContent() are assumed non-null here;
                // a null would throw NPE and abort the whole batch - confirm.
                doc.add(new Field("id", article.getId().toString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.add(new Field("content", article.getContent().toString(), Field.Store.YES, Field.Index.ANALYZED));
                indexWriter.addDocument(doc);
                System.out.println("索引添加成功:第" + (i + 1) + "次!!");
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (indexWriter != null) {
                try {
                    indexWriter.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (directory != null) {
                try {
                    directory.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        System.out.println("创建索引文件成功,总共花费" + (System.currentTimeMillis() - startTime) + "毫秒。");
        System.out.println("*****************创建索引结束**********************");
    }

    /**
     * Multi-field, single-keyword search against the on-disk index.
     *
     * @param keyword query string
     * @return ids of the matching documents, or null on failure
     */
    public static List<String> getReultFromIndex(String keyword) {
        IndexSearcher isearcher = null;
        IndexReader indexReader = null;
        try {
            indexReader = IndexReader.open(FSDirectory.open(indexFile));
            isearcher = new IndexSearcher(indexReader);
            // Use IK's similarity so scoring matches IK's tokenization.
            isearcher.setSimilarity(new IKSimilarity());
            // IKQueryParser builds the multi-field query directly - no need for
            // a MultiFieldQueryParser + analyzer combination here.
            Query query = IKQueryParser.parseMultiField(fieldName, keyword);
            splitWord(keyword); // show the developer how the keyword was split
            // Fetch the 50 best-scoring hits.
            TopDocs topDocs = isearcher.search(query, 50);
            System.out.println("命中:" + topDocs.totalHits);
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
            List<String> returnList = new ArrayList<String>();
            for (int i = 0; i < scoreDocs.length; i++) {
                Document targetDoc = isearcher.doc(scoreDocs[i].doc);
                System.out.println("命中内容: id:" + targetDoc.get("id") + " content:" + targetDoc.get("content"));
                returnList.add(targetDoc.get("id"));
            }
            return returnList;
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (LockObtainFailedException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (isearcher != null) {
                try {
                    isearcher.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            // FIX: the reader was opened but never closed (resource leak).
            if (indexReader != null) {
                try {
                    indexReader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return null;
    }

    /**
     * Indexes the given entities into a transient RAM index, then runs a
     * multi-field, single-keyword search against it.
     *
     * @param keyword query string
     * @param cs      entities to index ("content" analyzed, "id" also analyzed
     *                to mirror the original behavior)
     * @return ids of the matching documents, or null on failure
     */
    public static List<String> getReultFromMemory(String keyword, List<Conchs> cs) {
        Directory directory = null;
        IndexWriter iwriter = null;
        IndexSearcher isearcher = null;
        try {
            // Build the in-memory index.
            directory = new RAMDirectory();
            iwriter = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
            for (Conchs text : cs) {
                Document doc = new Document();
                doc.add(new Field("content", text.getContent(), Field.Store.YES, Field.Index.ANALYZED));
                doc.add(new Field("id", text.getId().toString(), Field.Store.YES, Field.Index.ANALYZED));
                System.out.println("id:" + text.getId() + " text:" + text.getContent());
                iwriter.addDocument(doc);
            }
            // Commit before searching; null the reference so finally won't re-close.
            iwriter.close();
            iwriter = null;
            isearcher = new IndexSearcher(directory);
            // Use IK's similarity so scoring matches IK's tokenization.
            isearcher.setSimilarity(new IKSimilarity());
            Query query = IKQueryParser.parseMultiField(fieldName, keyword);
            splitWord(keyword); // show the developer how the keyword was split
            // Fetch the 20 best-scoring hits.
            TopDocs topDocs = isearcher.search(query, 20);
            System.out.println("命中:" + topDocs.totalHits);
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
            List<String> returnList = new ArrayList<String>();
            for (int i = 0; i < scoreDocs.length; i++) {
                Document targetDoc = isearcher.doc(scoreDocs[i].doc);
                System.out.println("内容:" + targetDoc.toString() + " id:" + targetDoc.get("id"));
                returnList.add(targetDoc.get("id"));
            }
            return returnList;
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (LockObtainFailedException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // FIX: close the writer if an exception fired before the in-try close().
            if (iwriter != null) {
                try {
                    iwriter.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (isearcher != null) {
                try {
                    isearcher.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (directory != null) {
                try {
                    directory.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return null;
    }

    /**
     * Dumps the on-disk index by reading every document directly (no query),
     * printing a summary and reconstructing Article objects from the stored
     * "id"/"title"/"content" fields.
     *
     * NOTE(review): createConchsIndexFile() never stores a "title" field, so
     * this method appears to target a different (Article-based) index layout -
     * confirm which index it is meant to read.
     */
    public static void openIndexFile() {
        long startTime = System.currentTimeMillis();
        System.out.println("*****************读取索引开始**********************");
        List<Article> articles = new ArrayList<Article>();
        Directory directory = null;
        IndexReader indexReader = null;
        try {
            directory = new SimpleFSDirectory(indexFile);
            // false = open the reader in read-write mode (allows deleteDocument).
            indexReader = IndexReader.open(directory, false);
            System.out.println("在索引文件中总共插入了" + indexReader.maxDoc() + "条记录。");
            int docLength = indexReader.maxDoc();
            // FIX: guard against an empty index - document(-1) would throw.
            if (docLength > 0) {
                Document minDoc = indexReader.document(0);
                Document maxDoc = indexReader.document(docLength - 1);
                System.out.println("第一个插入的document对象的标题是:" + minDoc.get("title"));
                System.out.println("最后一个插入的document对象的标题是:" + maxDoc.get("title"));
            }
            for (int i = 0; i < docLength; i++) {
                Document doc = indexReader.document(i);
                Article article = new Article();
                if (doc.get("id") == null) {
                    System.out.println("id为空");
                } else {
                    article.setId(Integer.parseInt(doc.get("id")));
                    article.setTitle(doc.get("title"));
                    article.setContent(doc.get("content"));
                    articles.add(article);
                }
            }
            System.out.println("显示所有插入的索引记录:");
            for (Article article : articles) {
                System.out.println(article);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (indexReader != null) {
                try {
                    indexReader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (directory != null) {
                try {
                    directory.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        System.out.println("直接读取索引文件成功,总共花费" + (System.currentTimeMillis() - startTime) + "毫秒。");
        System.out.println("*****************读取索引结束**********************");
    }

    /**
     * Ad-hoc test: searches the on-disk index with a per-field keyword array
     * ("" for id, "Analyzer" for title, "分词" for content).
     *
     * @return ids of the matching documents, or null on failure
     */
    public static List<String> getReultFromIndex11() {
        String keyword = "分词";
        String[] fieldName = {"id", "title", "content"};
        String[] field = {"", "Analyzer", keyword};
        IndexSearcher isearcher = null;
        IndexReader indexReader = null;
        try {
            indexReader = IndexReader.open(FSDirectory.open(indexFile));
            isearcher = new IndexSearcher(indexReader);
            // Use IK's similarity so scoring matches IK's tokenization.
            isearcher.setSimilarity(new IKSimilarity());
            // One keyword per field, parsed by IK's query parser.
            Query query = IKQueryParser.parseMultiField(fieldName, field);
            splitWord(keyword); // show the developer how the keyword was split
            // Fetch the 20 best-scoring hits.
            TopDocs topDocs = isearcher.search(query, 20);
            System.out.println("命中:" + topDocs.totalHits);
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
            List<String> returnList = new ArrayList<String>();
            for (int i = 0; i < scoreDocs.length; i++) {
                Document targetDoc = isearcher.doc(scoreDocs[i].doc);
                System.out.println("内容:" + targetDoc.toString() + " id:" + targetDoc.get("id"));
                returnList.add(targetDoc.get("id"));
            }
            return returnList;
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (LockObtainFailedException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (isearcher != null) {
                try {
                    isearcher.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            // FIX: the reader was opened but never closed (resource leak).
            if (indexReader != null) {
                try {
                    indexReader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return null;
    }

    /** Manual smoke test: searches the on-disk index for a sample phrase. */
    public static void main(String[] args) throws IOException {
        getReultFromIndex("我喜欢小猫");
    }
}