Lucene 学习

1. 对《三国演义》文本进行预处理

package tool;

/**
* 编码 为 GB2312
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;

/**
 * Splits one large text file into numbered chunk files
 * ({@code output1.txt}, {@code output2.txt}, ...) and converts a small set of
 * full-width CJK punctuation marks to their ASCII equivalents on the way.
 *
 * <p>Reading, writing and the chunk-size measurement all use the JVM's
 * default charset, exactly as the original code did.
 * NOTE(review): the surrounding file mentions GB2312 - confirm the JVM default
 * matches the encoding of the input text.
 */
public class Prepare {

    /** Input file used when no command-line argument is given. */
    private static final String DEFAULT_INPUT_FILE =
            "E:\\MyEclipse Workspace\\SearchDemo\\三国演义.txt";

    /** Output directory used when no second command-line argument is given. */
    private static final String DEFAULT_OUTPUT_DIR =
            "E:\\MyEclipse Workspace\\SearchDemo\\处理后电子书";

    /** A chunk is flushed once its encoded size reaches this many bytes. */
    private static final int MAX_SIZE = 10240;

    /**
     * Entry point.
     *
     * @param args optional: {@code args[0]} is the input file and
     *             {@code args[1]} the output directory; the built-in default
     *             paths are used for any argument that is missing
     * @throws IOException if the input cannot be read or a chunk cannot be written
     */
    public static void main(String[] args) throws IOException {
        String inputFile = args.length > 0 ? args[0] : DEFAULT_INPUT_FILE;
        String outputDir = args.length > 1 ? args[1] : DEFAULT_OUTPUT_DIR;

        File dir = new File(outputDir);
        if (!dir.exists()) {
            dir.mkdirs();
        }

        preProcess(new File(inputFile), outputDir);
    }

    /**
     * Splits {@code file} into chunk files inside {@code outputDir}.
     *
     * <p>Each input line is passed through {@link #replace(String)} and
     * terminated with {@code "\r\n"}. Whenever the accumulated text reaches
     * {@link #MAX_SIZE} bytes it is written to the next numbered file. The text
     * remaining after the last line is written to a final file, so at least
     * {@code output1.txt} is always produced (possibly empty).
     *
     * @param file      the text file to split
     * @param outputDir an existing directory that receives the chunk files
     * @throws IOException if reading or writing fails
     */
    static void preProcess(File file, String outputDir) throws IOException {
        BufferedReader br = new BufferedReader(new FileReader(file));
        try {
            int filePointer = 1;
            StringBuilder sb = new StringBuilder();

            String line;
            while ((line = br.readLine()) != null) {
                sb.append(replace(line)).append("\r\n");

                // The buffer is emptied after every flush, so one check per line suffices.
                if (sb.toString().getBytes().length >= MAX_SIZE) {
                    writeChunk(outputDir, filePointer, sb.toString());
                    filePointer++;
                    sb.setLength(0);
                }
            }

            // Write whatever is left; without this the tail of the input is lost.
            writeChunk(outputDir, filePointer, sb.toString());
        } finally {
            br.close();
        }
    }

    /** Writes {@code content} to {@code output<index>.txt} in {@code outputDir}. */
    private static void writeChunk(String outputDir, int index, String content)
            throws IOException {
        // File(parent, child) inserts the platform's own separator.
        File target = new File(outputDir, "output" + index + ".txt");
        PrintWriter out = new PrintWriter(target);
        try {
            out.print(content);
            // PrintWriter swallows I/O errors; surface them explicitly.
            if (out.checkError()) {
                throw new IOException("Failed to write " + target);
            }
        } finally {
            out.close();
        }
    }

    // Full-width punctuation to convert, written as Unicode escapes so the
    // table survives re-encoding of this source file:
    // U+FF0C, U+3002, U+300A, U+300B, U+3010, U+3011, U+FF1F, U+FF1A, U+FF08, U+FF09
    static char[] oldC = new char[] {
        '\uFF0C', '\u3002', '\u300A', '\u300B', '\u3010',
        '\u3011', '\uFF1F', '\uFF1A', '\uFF08', '\uFF09'
    };

    // ASCII replacement for the character at the same index in oldC.
    static char[] newC = new char[] {',', '.', '<', '>', '[', ']', '?', ':', '(', ')'};

    /**
     * Replaces every character listed in {@link #oldC} with the character at
     * the same index in {@link #newC}.
     *
     * @param line the text to convert
     * @return the converted text
     */
    static String replace(String line) {
        for (int i = 0; i < oldC.length; i++) {
            line = line.replace(oldC[i], newC[i]);
        }
        return line;
    }

}


2. 为《三国演义》创建索引
package tool;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

import jeasy.analysis.MMAnalyzer;;

/**
 * Builds a Lucene index over every regular file in a directory. Each file
 * becomes one document with a stored {@code fileName} field and an unstored,
 * tokenized {@code content} field.
 */
public class IndexProcesser {

    /** Directory in which the index is stored. */
    private String INDEX_STORE_PATH = "E:\\MyEclipse Workspace\\SearchDemo\\index";

    /**
     * Indexes all regular files directly inside {@code inputDir}.
     *
     * <p>Failures are reported with a stack trace rather than propagated, as
     * in the original implementation.
     *
     * @param inputDir directory holding the text files to index
     */
    public void createIndex(String inputDir) {
        // Validate the input before the writer is created: the writer is opened
        // with create=true, so a bad path must not cost us the existing index.
        File[] files = new File(inputDir).listFiles();
        if (files == null) {
            System.err.println("Input directory does not exist or cannot be read: " + inputDir);
            return;
        }

        IndexWriter writer = null;
        try {
            // Arguments: index location, analyzer used for tokenizing, and
            // whether to discard whatever index already exists at that location.
            writer = new IndexWriter(INDEX_STORE_PATH, new MMAnalyzer(), true);

            for (int i = 0; i < files.length; i++) {
                if (!files[i].isFile()) {
                    continue; // a sub-directory cannot be opened as text
                }

                Document doc = new Document();
                doc.add(new Field("fileName", files[i].getName(),
                        Field.Store.YES, Field.Index.TOKENIZED));
                doc.add(new Field("content", loadFileToString(files[i]),
                        Field.Store.NO, Field.Index.TOKENIZED));
                writer.addDocument(doc);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // The writer must always be closed so buffered index data reaches disk.
            if (writer != null) {
                try {
                    writer.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Reads a whole text file into a string using the platform default charset.
     * Lines are concatenated directly; the line terminators are not kept.
     *
     * @param file the file to read
     * @return the file's text without line terminators
     * @throws IOException if the file cannot be read
     */
    private String loadFileToString(File file) throws IOException {
        BufferedReader br = new BufferedReader(new FileReader(file));
        try {
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line);
            }
            return sb.toString();
        } finally {
            br.close();
        }
    }

    /** Indexes the default directory of pre-processed chunk files. */
    public static void main(String[] args) {
        IndexProcesser ip = new IndexProcesser();
        ip.createIndex("E:\\MyEclipse Workspace\\SearchDemo\\处理后电子书");
    }

}


3. 搜索:
package tool;

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

/**
 * Looks up a single term in the index and prints, for every matching
 * document, the term frequency followed by the document itself.
 */
public class Search {

    /**
     * Prints the frequency of a term and the document it occurs in, for each
     * document containing that term.
     *
     * @param searchType name of the field to search
     * @param searchKey  the keyword to look up in that field
     * @throws IOException if the index cannot be opened or read
     */
    public void indexSearch(String searchType, String searchKey) throws IOException {
        // Directory that holds the index.
        IndexSearcher searcher = new IndexSearcher("E:\\MyEclipse Workspace\\SearchDemo\\index");
        try {
            IndexReader reader = searcher.getIndexReader();

            // The unit being searched for: field name plus keyword.
            Term t = new Term(searchType, searchKey);

            // TermDocs enumerates the documents that contain the term.
            TermDocs docs = reader.termDocs(t);
            try {
                while (docs.next()) {
                    System.out.println(docs.freq());                 // occurrences of the keyword
                    System.out.println(reader.document(docs.doc())); // document containing it
                }
            } finally {
                docs.close();
            }
        } finally {
            searcher.close();
        }
    }

    /** Searches the {@code content} field for a sample keyword. */
    public static void main(String[] args) throws IOException {
        Search s = new Search();
        s.indexSearch("content", "孔明");
    }

}


4. 部分结果(在 content 字段中搜索“孔明”的输出:每条结果先打印关键字出现次数,再打印所在文档):
3
Document<stored/uncompressed,indexed,tokenized<fileName:output100.txt>>
1
Document<stored/uncompressed,indexed,tokenized<fileName:output101.txt>>
1
Document<stored/uncompressed,indexed,tokenized<fileName:output104.txt>>
1
Document<stored/uncompressed,indexed,tokenized<fileName:output106.txt>>
1
Document<stored/uncompressed,indexed,tokenized<fileName:output108.txt>>
4
Document<stored/uncompressed,indexed,tokenized<fileName:output110.txt>>
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值