Lucene 学习

最新推荐文章于 2024-06-21 16:37:39 发布

Jason5563

最新推荐文章于 2024-06-21 16:37:39 发布

阅读量87

点赞数

分类专栏：搜索引擎　文章标签：lucene MyEclipse Java

搜索引擎专栏收录该内容

1 篇文章 0 订阅

订阅专栏

1. 对三国演义预处理

package tool;

/**
 * 编码 为 GB2312
 */
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;

/**
 * Pre-processes one large text file for indexing: converts a fixed set of
 * full-width punctuation marks to their half-width ASCII counterparts and
 * splits the result into numbered chunk files ({@code output1.txt},
 * {@code output2.txt}, ...) inside an output directory.
 *
 * <p>Reading and writing both use the platform default charset, because
 * {@link FileReader} and {@link PrintWriter} are created without an explicit
 * charset.
 * NOTE(review): the file header mentions GB2312 - confirm that the input file
 * is encoded in the platform default charset of the machine running this.
 */
public class Prepare {

	/**
	 * A chunk is flushed to disk as soon as its encoded size (platform default
	 * charset) reaches this many bytes, so every chunk except possibly the last
	 * one is at least this large.
	 */
	private static final int MAX_SIZE=10240;

	/**
	 * Splits the hard-coded input book into chunk files, creating the output
	 * directory first when it does not exist.
	 *
	 * @param args ignored
	 * @throws IOException if the output directory cannot be created or any
	 *                     file cannot be read or written
	 */
	public static void  main( String [] args ) throws IOException
	{
		String inputFile="E:\\MyEclipse Workspace\\SearchDemo\\三国演义.txt";
		String outputDir="E:\\MyEclipse Workspace\\SearchDemo\\处理后电子书";

		File dir=new File(outputDir);
		if( !dir.exists() && !dir.mkdirs() )
		{
			// mkdirs() reports failure only through its return value.
			throw new IOException("Cannot create output directory: "+outputDir);
		}

		preProcess( new File(inputFile) , outputDir);
	}


	/**
	 * Splits {@code file} into n small files named {@code output<k>.txt}
	 * (k = 1, 2, ...) under {@code outputDir}.
	 *
	 * <p>Each input line is passed through {@link #replace(String)} and
	 * terminated with {@code "\r\n"}. Whenever the buffered text reaches
	 * {@link #MAX_SIZE} bytes it is written out as the next chunk. Text that
	 * is still buffered when the input ends is written as the final chunk.
	 * An empty input still produces an empty {@code output1.txt}; no empty
	 * trailing chunk is created otherwise.
	 *
	 * @param file      the text file to split
	 * @param outputDir an existing directory that receives the chunk files
	 * @throws IOException if the input cannot be read or a chunk cannot be
	 *                     written
	 */
	static void preProcess( File file, String outputDir ) throws IOException
	{
		BufferedReader br=new BufferedReader( new FileReader(file) );
		try
		{
			int filePointer=1;
			StringBuilder sb=new StringBuilder();

			String line;
			while( (line=br.readLine())!=null )
			{
				sb.append( replace(line) ).append("\r\n");

				// Size is measured in encoded bytes, not in characters.
				if( sb.toString().getBytes().length>=MAX_SIZE )
				{
					writeChunk(outputDir, filePointer, sb.toString());
					filePointer++;
					sb.setLength(0);
				}
			}

			// Flush the tail that never reached MAX_SIZE; without this the end
			// of the input would be lost. The filePointer==1 case keeps the
			// "at least one output file" behaviour for an empty input.
			if( sb.length()>0 || filePointer==1 )
			{
				writeChunk(outputDir, filePointer, sb.toString());
			}
		}
		finally
		{
			br.close();
		}
	}


	/** Writes {@code text} to {@code output<index>.txt} inside {@code outputDir}. */
	private static void writeChunk( String outputDir, int index, String text ) throws IOException
	{
		// File(parent, child) inserts the platform's own separator instead of
		// a hard-coded backslash.
		File target=new File(outputDir, "output"+index+".txt");
		PrintWriter out=new PrintWriter(target);
		try
		{
			out.print(text);
			// PrintWriter never throws on write errors; checkError() flushes
			// and reports whether one occurred.
			if( out.checkError() )
			{
				throw new IOException("Failed to write "+target);
			}
		}
		finally
		{
			out.close();
		}
	}


	// Full-width to half-width punctuation: oldC[i] is replaced by newC[i].
	static char[]oldC=new char[]{'，','。','《','》','【','】','？','：','（','）'};
	static char[]newC=new char[]{',','.','<','>','[',']','?',':','(',')'};

	/**
	 * Replaces every full-width punctuation mark listed in {@link #oldC} with
	 * the half-width character at the same index of {@link #newC}.
	 *
	 * @param line the text to convert; must not be {@code null}
	 * @return the converted text
	 */
	static String replace( String line )
	{
		for( int i=0; i<oldC.length; i++)
		{
			line=line.replace(oldC[i], newC[i]);
		}
		return line;
	}

}

2. 为三国演义创建索引

package tool;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

import jeasy.analysis.MMAnalyzer;;

/**
 * Builds a Lucene index over the files of one directory. Every regular file
 * becomes one document with two fields: {@code fileName} (stored and
 * tokenized) and {@code content} (tokenized, not stored).
 */
public class IndexProcesser {

	/** Directory in which the index is stored. */
	private static final String INDEX_STORE_PATH="E:\\MyEclipse Workspace\\SearchDemo\\index";

	/**
	 * Indexes every regular file directly inside {@code inputDir}.
	 *
	 * <p>Failures are not propagated: any exception is caught and its stack
	 * trace printed.
	 *
	 * @param inputDir path of the directory whose files are indexed
	 */
	public void createIndex(String inputDir)
	{
		try{
			// Validate the input before opening the writer: the writer is
			// opened in "create" mode, so a bad input directory should not be
			// allowed to wipe an existing index.
			File [] files=new File(inputDir).listFiles();
			if(files==null)
			{
				// listFiles() returns null when the path is not a directory
				// or cannot be read.
				throw new IOException("Not a readable directory: "+inputDir);
			}

			// Create the index writer. Arguments: index location, the analyzer
			// used for tokenizing, and whether existing files at that location
			// are deleted.
			IndexWriter writer = new IndexWriter(INDEX_STORE_PATH, new MMAnalyzer(), true);
			try
			{
				for(int i=0;i<files.length; i++)
				{
					if(!files[i].isFile())
					{
						// Sub-directories cannot be opened by FileReader; skip
						// them instead of aborting the whole run.
						continue;
					}

					Document doc=new Document();
					doc.add(new Field("fileName",files[i].getName(),Field.Store.YES, Field.Index.TOKENIZED ));
					doc.add(new Field("content", loadFileToString(files[i]), Field.Store.NO, Field.Index.TOKENIZED));
					writer.addDocument(doc);
				}
			}
			finally
			{
				// The writer must always be closed so that buffered index data
				// is written to disk, even when indexing a file fails.
				writer.close();
			}
		} catch( Exception e) {
			e.printStackTrace();
		}

	}

	/**
	 * Reads a whole text file into one string using the platform default
	 * charset. Line terminators are dropped and the lines are joined without
	 * any separator.
	 * NOTE(review): joining without a separator lets the last word of a line
	 * touch the first word of the next one before analysis - confirm this is
	 * intended.
	 *
	 * @param file the file to read
	 * @return the concatenated lines of the file
	 * @throws IOException if the file cannot be opened or read
	 */
	private String loadFileToString(File file) throws IOException {
		BufferedReader br=new BufferedReader(  new FileReader(file));
		try
		{
			StringBuffer sb=new StringBuffer();
			String line=br.readLine();
			while(line!=null)
			{
				sb.append(line);
				line=br.readLine();
			}
			return sb.toString();
		}
		finally
		{
			// Release the file handle on both the normal and the error path.
			br.close();
		}
	}

	/**
	 * Indexes the hard-coded directory of pre-processed chunk files.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		IndexProcesser ip=new IndexProcesser();
		ip.createIndex("E:\\MyEclipse Workspace\\SearchDemo\\处理后电子书");
	}

}

3. 搜索：

package tool;

import java.io.IOException;

import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

/**
 * Looks up a single term in the index and prints, for every document that
 * contains it, the term frequency followed by the document itself.
 */
public class Search {

	/**
	 * Prints every document in which the given term occurs.
	 *
	 * <p>{@code searchKey} is passed to the index as one term exactly as given;
	 * this method applies no analysis to it.
	 *
	 * @param searchType name of the field to search
	 * @param searchKey  the term to look up in that field
	 * @throws IOException if the index cannot be opened or read
	 */
	public void indexSearch(String searchType, String searchKey) throws IOException
	{
		// Directory that holds the index.
		IndexSearcher searcher=new IndexSearcher("E:\\MyEclipse Workspace\\SearchDemo\\index");
		try
		{
			// Search unit: searchType is the field, searchKey the keyword.
			Term t=new Term(searchType,searchKey);

			// TermDocs enumerates the documents that contain the term.
			TermDocs docs=searcher.getIndexReader().termDocs(t);
			try
			{
				while(docs.next())
				{
					System.out.println(docs.freq());	// occurrences of the keyword in this document
					System.out.println( searcher.getIndexReader().document( docs.doc() ) ); // the containing document
				}
			}
			finally
			{
				docs.close();
			}
		}
		finally
		{
			// Release the index files even when the lookup fails.
			searcher.close();
		}
	}

	/**
	 * Searches the {@code content} field for a fixed sample keyword.
	 *
	 * @param args ignored
	 * @throws IOException if the index cannot be opened or read
	 */
	public static void main(String[] args) throws IOException {
		Search s=new Search();
		s.indexSearch("content", "孔明");
	}

}

4. 部分结果：
3
Document<stored/uncompressed,indexed,tokenized<fileName:output100.txt>>
1
Document<stored/uncompressed,indexed,tokenized<fileName:output101.txt>>
1
Document<stored/uncompressed,indexed,tokenized<fileName:output104.txt>>
1
Document<stored/uncompressed,indexed,tokenized<fileName:output106.txt>>
1
Document<stored/uncompressed,indexed,tokenized<fileName:output108.txt>>
4
Document<stored/uncompressed,indexed,tokenized<fileName:output110.txt>>

Jason5563

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Lucene 学习

[size=large]1.对三国演义预处理[/size][code="java"]package tool;/** * 编码为 GB2312 */import java.io.BufferedReader;import java.io.File;import java.io.FileReader;import java.io.IOException;...
复制链接

扫一扫