package test;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.spell.LuceneDictionary;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.junit.Test;
public class TermSearcher {

    // NOTE(review): SimpleDateFormat is not thread-safe; this tool is
    // single-threaded, but switch to DateTimeFormatter if it is ever shared.
    public static final SimpleDateFormat SDF_CHILD_PATH = new SimpleDateFormat("yyyyMMdd_HHmmssSSS");

    /** Accumulates term -> occurrence count across the fields scanned by {@link #maptolist}. */
    public static Map<String, Integer> sizeMap = new HashMap<String, Integer>();

    public static void main(String[] args) throws IOException {
        searchTerms();
    }

    /**
     * Opens each configured index directory in turn and dumps its term
     * statistics to CSV via {@link #maptolist}.
     */
    public static void searchTerms() {
        List<String> pathList = new ArrayList<String>();
        pathList.add("D:/newindex/1");
        pathList.add("D:/newindex/2");
        for (String indexReadPath : pathList) {
            Directory directory = null;
            IndexReader reader = null;
            try {
                directory = FSDirectory.open(new File(indexReadPath)); // open the index folder
                reader = DirectoryReader.open(directory);              // read the directory
                Fields fields = MultiFields.getFields(reader);
                // Dump per-term counts for the configured fields; the third
                // path segment ("1", "2", ...) names the output file.
                maptolist(fields, indexReadPath.split("/")[2]);
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                // Close in reverse order of acquisition. Previously a failure
                // inside maptolist leaked both the reader and the directory.
                if (reader != null) {
                    try { reader.close(); } catch (IOException ignored) { }
                }
                if (directory != null) {
                    try { directory.close(); } catch (IOException ignored) { }
                }
            }
        }
    }

    /**
     * Writes "term,count" rows for each configured field to CSV files under
     * F:/Terms, flushing in chunks of 30000 rows.
     *
     * Fixes vs. the original:
     * - a single running counter was shared by ALL terms (every term was
     *   stored with the same ever-growing value); counts are now per term.
     * - the row buffer was not cleared between fields, so each later field
     *   re-wrote all rows written for the previous one.
     * - a field missing from the index caused a NullPointerException.
     *
     * NOTE(review): sizeMap deliberately keeps accumulating across fields and
     * index paths (original behavior), so a term present in several fields is
     * counted once per field. If per-document frequency was intended, use
     * termsEnum.docFreq() instead — TODO confirm with the original author.
     *
     * @param fields        all fields of the index being scanned
     * @param indexReadPath short name ("1", "2", ...) used in the CSV filename
     * @throws IOException if the terms cannot be enumerated
     */
    public static void maptolist(Fields fields, String indexReadPath) throws IOException {
        List<String> fieldlist = new ArrayList<String>();
        fieldlist.add("brand_Name");
        fieldlist.add("virtual_Name");
        List<String> list = new ArrayList<String>();
        BytesRef byteRef = null;
        for (String field : fieldlist) {
            Terms terms = fields.terms(field);
            if (terms == null) {
                continue; // field not present in this index
            }
            TermsEnum termsEnum = terms.iterator(null);
            while ((byteRef = termsEnum.next()) != null) {
                String term = byteRef.utf8ToString();
                Integer prev = sizeMap.get(term);
                sizeMap.put(term, prev == null ? 1 : prev + 1);
            }
            list.clear();
            list.add(field); // header row naming the field
            for (Map.Entry<String, Integer> entry : sizeMap.entrySet()) {
                list.add(entry.getKey() + "," + entry.getValue());
                // Flush a chunk so huge term sets do not stay in memory.
                if (list.size() >= 30000) {
                    write(list, "F:/Terms", indexReadPath);
                    list.clear();
                    list.add(field);
                }
            }
            write(list, "F:/Terms", indexReadPath);
        }
    }

    /**
     * Writes the given rows to a timestamped CSV file:
     * {path}/{indexReadPath}-{timestamp}.csv
     *
     * Fixes vs. the original: IOExceptions are no longer silently swallowed,
     * the output directory is created if missing, and only the outermost
     * writer needs closing (closing it closes the whole chain).
     *
     * @param list          rows to write, one per line
     * @param path          output directory (e.g. "F:/Terms")
     * @param indexReadPath short index name used as the filename prefix
     */
    public static void write(List<String> list, String path, String indexReadPath) {
        StringBuffer sbBuffer = new StringBuffer();
        for (String string : list) {
            sbBuffer.append(string).append("\n");
        }
        String name = path + File.separator + indexReadPath + "-" + SDF_CHILD_PATH.format(new Date()) + ".csv";
        File file = new File(name);
        File parent = file.getParentFile();
        if (parent != null && !parent.exists()) {
            parent.mkdirs(); // previously the write failed silently when F:/Terms was missing
        }
        PrintWriter out = null;
        try {
            // FileWriter creates the file itself; the old createNewFile() call was redundant.
            out = new PrintWriter(new BufferedWriter(new FileWriter(file)));
            out.write(sbBuffer.toString());
            out.flush();
        } catch (IOException e) {
            e.printStackTrace(); // was an empty catch — failures vanished without a trace
        } finally {
            if (out != null) {
                out.close(); // closes the buffered writer and the file writer too
            }
        }
    }

    /**
     * Prints the terms of field "brand_Name" together with their document
     * frequency, total frequency, positions, offsets and payloads, for up to
     * 100 terms.
     *
     * Fixes vs. the original: the reader was opened twice (leaking the first
     * instance), and neither reader nor directory was closed on the early
     * return or on exceptions.
     *
     * NOTE(review): JUnit 4 rejects static @Test methods; kept static here
     * because the method is also usable as a plain entry point — confirm
     * whether it is actually run by a JUnit runner.
     *
     * @throws IOException if the index cannot be read
     */
    @Test
    public static void showIndex() throws IOException {
        String indexReadPath = "D:/newindex/1";
        Directory directory = FSDirectory.open(new File(indexReadPath)); // open the index folder
        IndexReader reader = null;
        try {
            reader = DirectoryReader.open(directory); // opened once (was opened twice, leaking one reader)
            Fields fields = MultiFields.getFields(reader); // all fields in the directory
            Terms terms = fields.terms("brand_Name");
            TermsEnum termsEnum = terms.iterator(null);
            BytesRef term = null;
            int count = 1;
            while ((term = termsEnum.next()) != null) {
                System.out.println("分词的内容>>>>>>>" + term.utf8ToString() + "\t");           // term text
                System.out.println("出现该分词的有文档的数量>>>>>>>>>" + termsEnum.docFreq() + "\t"); // doc frequency
                System.out.println("分词的总数>>>>>>>" + termsEnum.totalTermFreq() + "\t");       // total term frequency
                DocsAndPositionsEnum docsAndPositionsEnum = termsEnum.docsAndPositions(null, null);
                // null when the field was indexed without positions — skip it
                if (docsAndPositionsEnum == null) {
                    continue;
                }
                int docId;
                while ((docId = docsAndPositionsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    Document document = reader.document(docId); // fetch the stored document
                    System.out.println(docId + "\t");
                    System.out.println("可以获取document中field的值>>>>>>>>" + document.get("brand_Name") + "\t");
                    int freq = docsAndPositionsEnum.freq(); // occurrences of the term in this doc
                    for (int i = 0; i < freq; i++) {
                        System.out.println("分词的位置>>>>>>>" + docsAndPositionsEnum.nextPosition() + ":");
                        System.out.print("分词起始偏移量的位置>>>[" + docsAndPositionsEnum.startOffset() + "");
                        System.out.println(docsAndPositionsEnum.endOffset() + "],>>>>分词结束偏移量的位置");
                        System.out.println(docsAndPositionsEnum.getPayload() + "\t");
                    }
                }
                count++;
                if (count >= 100) {
                    return; // cap the output; resources are still released by the finally block
                }
            }
        } finally {
            if (reader != null) {
                try { reader.close(); } catch (IOException ignored) { }
            }
            directory.close();
        }
    }

    /**
     * Prints every term of the given field using a LuceneDictionary.
     *
     * Fix vs. the original: a never-assigned local "outputString" was printed
     * (always empty) before each term; that dead output line is removed.
     *
     * @param reader open index reader
     * @param field  field whose terms are enumerated
     * @throws IOException if the dictionary cannot be iterated
     */
    public static void getTerms(IndexReader reader, String field) throws IOException {
        System.out.println("---------------getTerms----------------");
        LuceneDictionary ld = new LuceneDictionary(reader, field);
        BytesRefIterator iterator = ld.getEntryIterator();
        BytesRef byteRef = null;
        while ((byteRef = iterator.next()) != null) {
            System.out.println(byteRef.utf8ToString());
        }
    }
}
// Queries the index directly and writes the desired fields to a CSV file.
// (Source originally published 2022-01-11 15:28:39.)