[size=large]1. 对三国演义预处理[/size]
[size=large]2. 为 三国演义 创建索引[/size]
[size=large]3. 搜索:[/size]
[size=large]4. 部分结果:[/size]
3
Document<stored/uncompressed,indexed,tokenized<fileName:output100.txt>>
1
Document<stored/uncompressed,indexed,tokenized<fileName:output101.txt>>
1
Document<stored/uncompressed,indexed,tokenized<fileName:output104.txt>>
1
Document<stored/uncompressed,indexed,tokenized<fileName:output106.txt>>
1
Document<stored/uncompressed,indexed,tokenized<fileName:output108.txt>>
4
Document<stored/uncompressed,indexed,tokenized<fileName:output110.txt>>
package tool;
/**
* 编码 为 GB2312
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
/**
 * Pre-processing step for the search demo: splits one large text file into
 * numbered chunk files ({@code output1.txt}, {@code output2.txt}, ...) and
 * converts a fixed set of full-width punctuation marks to their ASCII
 * counterparts along the way.
 *
 * <p>Reading, writing and chunk-size measurement all use the JVM's default
 * charset, exactly as {@link FileReader}, {@link PrintWriter} and
 * {@link String#getBytes()} do when no charset is given.
 */
public class Prepare {

    /** A chunk is flushed to disk once its encoded size reaches this many bytes. */
    private static final int MAX_SIZE = 10240;

    /**
     * Splits the hard-coded input book into chunk files inside the hard-coded
     * output directory, creating that directory first if it does not exist.
     *
     * @param args ignored
     * @throws IOException if the input cannot be read or a chunk cannot be written
     */
    public static void main(String[] args) throws IOException {
        String inputFile = "E:\\MyEclipse Workspace\\SearchDemo\\三国演义.txt";
        String outputDir = "E:\\MyEclipse Workspace\\SearchDemo\\处理后电子书";
        File dir = new File(outputDir);
        if (!dir.exists()) {
            dir.mkdirs();
        }
        preProcess(new File(inputFile), outputDir);
    }

    /**
     * Splits {@code file} into several small files under {@code outputDir}.
     *
     * <p>Lines are read one at a time, passed through {@link #replace(String)},
     * terminated with CRLF and accumulated. Whenever the accumulated text
     * reaches {@link #MAX_SIZE} bytes it is written to the next
     * {@code output<N>.txt} (N starts at 1). Whatever is left after the last
     * line is written as a final, shorter chunk. An empty input still yields
     * an empty {@code output1.txt}.
     *
     * @param file      the text file to split
     * @param outputDir existing directory that receives the chunk files
     * @throws IOException if reading the input or writing a chunk fails
     */
    static void preProcess(File file, String outputDir) throws IOException {
        BufferedReader br = new BufferedReader(new FileReader(file));
        try {
            int filePointer = 1;
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(replace(line)).append("\r\n");
                // Size is measured on the encoded bytes, not on the char count.
                if (sb.toString().getBytes().length >= MAX_SIZE) {
                    writeChunk(outputDir, filePointer, sb.toString());
                    filePointer++;
                    sb.setLength(0);
                }
            }
            // Flush the tail that never reached MAX_SIZE; without this the end
            // of the input (or all of a small input) would be silently dropped.
            if (sb.length() > 0 || filePointer == 1) {
                writeChunk(outputDir, filePointer, sb.toString());
            }
        } finally {
            br.close();
        }
    }

    /** Writes {@code text} to {@code output<index>.txt} inside {@code outputDir}. */
    private static void writeChunk(String outputDir, int index, String text) throws IOException {
        File target = new File(outputDir, "output" + index + ".txt");
        PrintWriter out = new PrintWriter(target);
        try {
            out.print(text);
            // PrintWriter swallows I/O errors; surface them explicitly.
            if (out.checkError()) {
                throw new IOException("Failed to write " + target);
            }
        } finally {
            out.close();
        }
    }

    // Full-width to half-width punctuation table. oldC[i] is replaced by newC[i].
    // The full-width characters are spelled as Unicode escapes so the table
    // cannot be flattened to ASCII when the source file is re-encoded.
    static char[] oldC = new char[] {
        '\uFF0C', // fullwidth comma
        '\u3002', // ideographic full stop
        '\u300A', // left double angle bracket
        '\u300B', // right double angle bracket
        '\u3010', // left black lenticular bracket
        '\u3011', // right black lenticular bracket
        '\uFF1F', // fullwidth question mark
        '\uFF1A', // fullwidth colon
        '\uFF08', // fullwidth left parenthesis
        '\uFF09'  // fullwidth right parenthesis
    };
    static char[] newC = new char[] {',', '.', '<', '>', '[', ']', '?', ':', '(', ')'};

    /**
     * Replaces every character listed in {@link #oldC} with the character at
     * the same index in {@link #newC}.
     *
     * @param line the text to convert
     * @return the converted text; characters not in the table are unchanged
     */
    static String replace(String line) {
        for (int i = 0; i < oldC.length; i++) {
            line = line.replace(oldC[i], newC[i]);
        }
        return line;
    }
}
[size=large]2. 为 三国演义 创建索引[/size]
package tool;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import jeasy.analysis.MMAnalyzer;;
/**
 * Builds a Lucene index over every regular file in a directory. Each file
 * becomes one document with a stored {@code fileName} field and an
 * indexed-but-not-stored {@code content} field.
 */
public class IndexProcesser {

    /** Directory the index is written to. */
    private final String INDEX_STORE_PATH = "E:\\MyEclipse Workspace\\SearchDemo\\index";

    /**
     * Creates the index at {@link #INDEX_STORE_PATH} from the files found
     * directly inside {@code inputDir}. Sub-directories are skipped.
     *
     * <p>Failures are reported with a stack trace on standard error and are
     * not propagated. The writer is always closed, so a failure half-way
     * through does not leave the writer open.
     *
     * @param inputDir directory containing the text files to index
     */
    public void createIndex(String inputDir) {
        IndexWriter writer = null;
        try {
            // Validate the input before opening the writer: opening it with
            // create=true replaces whatever index already exists at the path.
            File[] files = new File(inputDir).listFiles();
            if (files == null) {
                throw new FileNotFoundException("Not a readable directory: " + inputDir);
            }

            // Arguments: index location, analyzer used for tokenizing, and
            // whether to create a fresh index (true) instead of appending.
            writer = new IndexWriter(INDEX_STORE_PATH, new MMAnalyzer(), true);
            for (int i = 0; i < files.length; i++) {
                if (!files[i].isFile()) {
                    continue;
                }
                Document doc = new Document();
                doc.add(new Field("fileName", files[i].getName(),
                        Field.Store.YES, Field.Index.TOKENIZED));
                doc.add(new Field("content", loadFileToString(files[i]),
                        Field.Store.NO, Field.Index.TOKENIZED));
                writer.addDocument(doc);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Closing is mandatory: it is what flushes buffered index data to disk.
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Reads a whole text file into one string using the JVM's default charset.
     * Line terminators are dropped and the lines are joined without any
     * separator.
     *
     * @param file the file to read
     * @return the concatenated lines of the file
     * @throws IOException if the file cannot be opened or read
     */
    private String loadFileToString(File file) throws IOException {
        BufferedReader br = new BufferedReader(new FileReader(file));
        try {
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line);
            }
            return sb.toString();
        } finally {
            br.close();
        }
    }

    /**
     * Indexes the hard-coded directory of pre-processed chunk files.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        IndexProcesser ip = new IndexProcesser();
        ip.createIndex("E:\\MyEclipse Workspace\\SearchDemo\\处理后电子书");
    }
}
[size=large]3. 搜索:[/size]
package tool;
import java.io.IOException;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
/**
 * Looks up a single term in the demo index and prints, for every document
 * that contains it, the term frequency followed by the document itself.
 */
public class Search {

    /**
     * Enumerates the documents containing the term {@code searchKey} in the
     * field {@code searchType} and prints two lines per hit to standard
     * output: the value of {@code TermDocs.freq()} and the document.
     *
     * <p>The lookup goes through the index reader's term enumeration rather
     * than a scored query, so {@code searchKey} is used as given and is not
     * run through an analyzer here.
     *
     * @param searchType name of the field to search
     * @param searchKey  the term text to look up
     * @throws IOException if the index cannot be opened or read
     */
    public void indexSearch(String searchType, String searchKey) throws IOException {
        // Directory that holds the index.
        IndexSearcher searcher = new IndexSearcher("E:\\MyEclipse Workspace\\SearchDemo\\index");
        try {
            Term t = new Term(searchType, searchKey);
            // TermDocs is a cursor over the documents containing the term.
            TermDocs docs = searcher.getIndexReader().termDocs(t);
            try {
                while (docs.next()) {
                    System.out.println(docs.freq());
                    System.out.println(searcher.getIndexReader().document(docs.doc()));
                }
            } finally {
                docs.close();
            }
        } finally {
            // Release the index files held open by the searcher.
            searcher.close();
        }
    }

    /**
     * Runs a sample search for a fixed keyword in the {@code content} field.
     *
     * @param args ignored
     * @throws IOException if the index cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        Search s = new Search();
        s.indexSearch("content", "孔明");
    }
}
[size=large]4. 部分结果:[/size]
3
Document<stored/uncompressed,indexed,tokenized<fileName:output100.txt>>
1
Document<stored/uncompressed,indexed,tokenized<fileName:output101.txt>>
1
Document<stored/uncompressed,indexed,tokenized<fileName:output104.txt>>
1
Document<stored/uncompressed,indexed,tokenized<fileName:output106.txt>>
1
Document<stored/uncompressed,indexed,tokenized<fileName:output108.txt>>
4
Document<stored/uncompressed,indexed,tokenized<fileName:output110.txt>>