直接查询索引,将想要的字段写入csv文件

18 篇文章 0 订阅
package test;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.spell.LuceneDictionary;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.junit.Test;

public class TermSearcher {

	/**
	 * Timestamp pattern used to build unique CSV file names.
	 * NOTE(review): SimpleDateFormat is not thread-safe; acceptable here only
	 * because all access is single-threaded.
	 */
	public static final SimpleDateFormat SDF_CHILD_PATH = new SimpleDateFormat("yyyyMMdd_HHmmssSSS");

	/**
	 * term -&gt; occurrence count, accumulated across all fields and index
	 * directories processed in one run.
	 * NOTE(review): never cleared, so terms from earlier fields are written
	 * again for later fields — confirm whether cumulative output is intended.
	 */
	public static Map<String, Integer> sizeMap = new HashMap<String, Integer>();

	public static void main(String[] args) throws IOException {
		searchTerms();
	}

	/**
	 * Walks a fixed list of index directories and dumps term counts for the
	 * configured fields into CSV files (see {@link #maptolist}).
	 */
	public static void searchTerms() {
		List<String> pathList = new ArrayList<String>();
		pathList.add("D:/newindex/1");
		pathList.add("D:/newindex/2");
		for (String indexReadPath : pathList) {
			Directory directory = null;
			IndexReader reader = null;
			try {
				directory = FSDirectory.open(new File(indexReadPath)); // open the index folder
				reader = DirectoryReader.open(directory);
				Fields fields = MultiFields.getFields(reader);
				// Tag the output files with the directory's leaf name.
				// BUGFIX: split("/")[2] broke for any path with a different depth.
				maptolist(fields, new File(indexReadPath).getName());
			} catch (Exception e) {
				e.printStackTrace();
			} finally {
				// BUGFIX: reader/directory were leaked when an exception was
				// thrown mid-loop (close calls used to live inside the try).
				if (reader != null) {
					try { reader.close(); } catch (IOException ignored) { }
				}
				if (directory != null) {
					try { directory.close(); } catch (IOException ignored) { }
				}
			}
		}
	}

	/**
	 * Counts each term of the configured fields and writes "term,count" rows
	 * (headed by the field name) to CSV files under F:/Terms, flushing every
	 * 30000 rows so memory stays bounded.
	 *
	 * @param fields        per-field terms view of the open index
	 * @param indexReadPath short tag (index folder name) used in the file name
	 * @throws IOException if the terms enumeration fails
	 */
	public static void maptolist(Fields fields, String indexReadPath) throws IOException {
		List<String> fieldlist = new ArrayList<String>();
		fieldlist.add("brand_Name");
		fieldlist.add("virtual_Name");
		List<String> list = new ArrayList<String>();
		for (String field : fieldlist) {
			Terms terms = fields.terms(field);
			if (terms == null) {
				continue; // BUGFIX: original NPE'd when a field was absent from the index
			}
			TermsEnum termsEnum = terms.iterator(null);
			BytesRef byteRef;
			while ((byteRef = termsEnum.next()) != null) {
				String term = byteRef.utf8ToString();
				// BUGFIX: the original kept one running "size" counter shared by
				// ALL terms, so every term received the same (wrong) count.
				// Count each term independently instead.
				Integer count = sizeMap.get(term);
				sizeMap.put(term, count == null ? 1 : count + 1);
			}
			list.add(field);
			for (Map.Entry<String, Integer> entry : sizeMap.entrySet()) {
				list.add(entry.getKey() + "," + entry.getValue());
				if (list.size() >= 30000) {
					write(list, "F:/Terms", indexReadPath);
					list.clear();
					list.add(field); // keep the field header on the next chunk
				}
			}
			write(list, "F:/Terms", indexReadPath);
			// BUGFIX: without clearing, the next field's CSV repeated these rows.
			list.clear();
		}
	}

	/**
	 * Writes the given rows to a new timestamped CSV file named
	 * {@code path/indexReadPath-yyyyMMdd_HHmmssSSS.csv}.
	 *
	 * @param list          rows to write, one per line
	 * @param path          target directory (created if missing)
	 * @param indexReadPath tag prepended to the file name
	 */
	public static void write(List<String> list, String path, String indexReadPath) {
		StringBuilder sb = new StringBuilder();
		for (String row : list) {
			sb.append(row).append('\n');
		}
		String name = path + File.separator + indexReadPath + "-" + SDF_CHILD_PATH.format(new Date()) + ".csv";
		PrintWriter out = null;
		try {
			File file = new File(name);
			File parent = file.getParentFile();
			if (parent != null && !parent.exists()) {
				parent.mkdirs(); // BUGFIX: writing failed when F:/Terms did not exist
			}
			// FileWriter creates the file itself; the redundant createNewFile() is gone.
			out = new PrintWriter(new BufferedWriter(new FileWriter(file)));
			out.write(sb.toString());
			out.flush();
		} catch (IOException e) {
			// BUGFIX: the original swallowed write errors silently.
			e.printStackTrace();
		} finally {
			if (out != null) {
				out.close(); // also closes the wrapped BufferedWriter/FileWriter
			}
		}
	}

	/**
	 * Debug utility: prints up to 100 terms of field "brand_Name" with their
	 * document frequency, total frequency, positions, offsets and payloads.
	 * NOTE(review): the original carried a JUnit {@code @Test} annotation, but
	 * JUnit 4 rejects static test methods, so the annotation is removed; call
	 * this method directly instead.
	 *
	 * @throws IOException if the index cannot be read
	 */
	public static void showIndex() throws IOException {
		String indexReadPath = "D:/newindex/1";
		Directory directory = FSDirectory.open(new File(indexReadPath)); // open the index folder
		// BUGFIX: the reader was opened twice, leaking the first instance.
		IndexReader reader = DirectoryReader.open(directory);
		try {
			Fields fields = MultiFields.getFields(reader); // all fields of the index
			Terms terms = fields.terms("brand_Name");
			if (terms == null) {
				return; // field not present in this index
			}
			TermsEnum termsEnum = terms.iterator(null);
			BytesRef term;
			int count = 1;
			while ((term = termsEnum.next()) != null) {
				System.out.println("分词的内容>>>>>>>" + term.utf8ToString() + "\t");
				System.out.println("出现该分词的有文档的数量>>>>>>>>>" + termsEnum.docFreq() + "\t");
				System.out.println("分词的总数>>>>>>>" + termsEnum.totalTermFreq() + "\t");
				DocsAndPositionsEnum docsAndPositionsEnum = termsEnum.docsAndPositions(null, null);
				// Null when the field indexes no positions; skip to the next term.
				if (docsAndPositionsEnum == null) {
					continue;
				}
				int docId;
				while ((docId = docsAndPositionsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
					Document document = reader.document(docId);
					System.out.println(docId + "\t");
					System.out.println("可以获取document中field的值>>>>>>>>" + document.get("brand_Name") + "\t");
					int freq = docsAndPositionsEnum.freq(); // occurrences of this term in this doc
					for (int i = 0; i < freq; i++) {
						System.out.println("分词的位置>>>>>>>" + docsAndPositionsEnum.nextPosition() + ":");
						System.out.print("分词起始偏移量的位置>>>[" + docsAndPositionsEnum.startOffset() + "");
						System.out.println(docsAndPositionsEnum.endOffset() + "],>>>>分词结束偏移量的位置");
						System.out.println(docsAndPositionsEnum.getPayload() + "\t");
					}
				}
				count++;
				if (count >= 100) {
					return; // cap the output; finally still closes the resources
				}
			}
		} finally {
			// BUGFIX: reader was skipped on the early return, and directory was
			// never closed at all.
			reader.close();
			directory.close();
		}
	}

	/**
	 * Prints every term of the given field using Lucene's spell-checker
	 * dictionary view of the index.
	 *
	 * @param reader open index reader
	 * @param field  field whose terms are enumerated
	 * @throws IOException if the enumeration fails
	 */
	public static void getTerms(IndexReader reader, String field) throws IOException {
		System.out.println("---------------getTerms----------------");
		LuceneDictionary ld = new LuceneDictionary(reader, field);
		BytesRefIterator iterator = ld.getEntryIterator();
		BytesRef byteRef;
		// BUGFIX: dropped the never-assigned "outputString" variable and the
		// leftover debug println that always printed an empty value.
		while ((byteRef = iterator.next()) != null) {
			System.out.println(byteRef.utf8ToString());
		}
	}

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值