搜索引擎工具类

最新推荐文章于 2024-04-21 19:51:19 发布

ruanzy888888

最新推荐文章于 2024-04-21 19:51:19 发布

阅读量148

点赞数

分类专栏：开发宝典文章标签：搜索引擎 lucene F#

本文链接：https://blog.csdn.net/ruanzy888888/article/details/83886165

版权

开发宝典专栏收录该内容

46 篇文章 0 订阅

订阅专栏

/**
 * Search-engine utility built on Lucene.
 * <p>
 * The single public entry point, {@link #search(String, String, int)}, splits
 * a large text file into small chunk files under {@code DATA_DIR}, builds a
 * fresh Lucene index of those chunks under {@code INDEX_DIR}, and then runs a
 * keyword query against the index, printing the matching chunk files.
 * <p>
 * Note: chunk files already present in {@code DATA_DIR} are not removed before
 * splitting, so every {@code .txt} file found there is indexed.
 *
 * @author ruanzhiyong6496
 * @version 1.0
 */
public class Lucene
{
	private static final String INDEX_DIR = "D:\\index";// directory holding the index
	private static final String DATA_DIR = "D:\\small";// directory holding the small chunk files

	/**
	 * Splits a large text file into small files of roughly 10 KB each.
	 * <p>
	 * The input is read line by line; every line is re-terminated with
	 * {@code \r\n}. Whenever the accumulated text reaches the size limit it is
	 * written to {@code DATA_DIR/<baseName><n>.txt}, where {@code n} counts up
	 * from 0. Text left over after the last line is written as a final,
	 * smaller chunk so that no input is lost.
	 * <p>
	 * I/O failures are reported on the console and otherwise ignored, so the
	 * caller keeps running (best effort).
	 * 
	 * @param filepath
	 *            path of the large file to split
	 */
	private static void splitToSmallFiles(String filepath)
	{
		final int MAX_SIZE = 1024 * 10;

		// Derive the chunk base name: strip the directory part (either
		// separator style) and the extension, if there is one.
		int separator = Math.max(filepath.lastIndexOf('/'), filepath
				.lastIndexOf('\\'));
		String fileName = filepath.substring(separator + 1);
		int dot = fileName.lastIndexOf('.');
		if (dot >= 0)
		{
			fileName = fileName.substring(0, dot);
		}

		BufferedReader reader = null;
		try
		{
			File dir = new File(DATA_DIR);
			if (!dir.exists() && !dir.mkdirs())
			{
				throw new IOException("cannot create directory " + dir);
			}
			reader = new BufferedReader(new FileReader(filepath));
			StringBuilder buffer = new StringBuilder();
			int bufferedBytes = 0;
			int filePointer = 0;
			String line;
			while ((line = reader.readLine()) != null)
			{
				String terminated = line + "\r\n";
				buffer.append(terminated);
				// Track the encoded size incrementally instead of re-encoding
				// the whole buffer for every line.
				bufferedBytes += terminated.getBytes().length;
				if (bufferedBytes >= MAX_SIZE)
				{
					writeChunk(new File(dir, fileName + filePointer + ".txt"),
							buffer.toString());
					filePointer++;
					buffer = new StringBuilder();
					bufferedBytes = 0;
				}
			}
			// Flush the tail that never reached MAX_SIZE; without this the end
			// of the file (or a whole small file) would never be indexed.
			if (buffer.length() > 0)
			{
				writeChunk(new File(dir, fileName + filePointer + ".txt"),
						buffer.toString());
			}
			System.out.println("The file hava splited to small files !");
		}
		catch (FileNotFoundException e)
		{
			System.out.println("file not found !");
			e.printStackTrace();
		}
		catch (IOException e)
		{
			e.printStackTrace();
		}
		finally
		{
			if (reader != null)
			{
				try
				{
					reader.close();
				}
				catch (IOException e)
				{
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Writes one chunk of text to the given file, always closing the writer.
	 * 
	 * @param target
	 *            chunk file to create or overwrite
	 * @param content
	 *            text to write
	 * @throws IOException
	 *             if the file cannot be opened or written
	 */
	private static void writeChunk(File target, String content)
			throws IOException
	{
		BufferedWriter writer = new BufferedWriter(new FileWriter(target));
		try
		{
			writer.write(content);
		}
		finally
		{
			writer.close();
		}
	}

	/**
	 * Indexes the .txt files under {@code DATA_DIR} into a newly created index
	 * stored in {@code INDEX_DIR}.
	 * 
	 * @return the number of documents in the index
	 * @throws IOException
	 *             if {@code DATA_DIR} does not exist or is not a directory, or
	 *             if indexing fails
	 */
	private static int index() throws IOException
	{
		File dataDr = new File(DATA_DIR);
		if (!dataDr.exists() || !dataDr.isDirectory())
		{
			throw new IOException(dataDr
					+ " does not exist or is not a directory");
		}

		IndexWriter writer = new IndexWriter(FSDirectory.open(new File(
				INDEX_DIR)), new StandardAnalyzer(Version.LUCENE_CURRENT),
				true, IndexWriter.MaxFieldLength.LIMITED);
		try
		{
			indexDirectory(writer, DATA_DIR);
			int numIndexed = writer.numDocs();
			writer.optimize();
			return numIndexed;
		}
		finally
		{
			// Close even when indexing fails so the writer is not left open.
			writer.close();
		}
	}

	/**
	 * Recursively walks a directory and indexes every .txt file in it.
	 * 
	 * @param writer
	 *            index writer that receives the documents
	 * @param dir
	 *            path of the directory to walk
	 * @throws IOException
	 *             if indexing a file fails
	 */
	private static void indexDirectory(IndexWriter writer, String dir)
			throws IOException
	{
		File dr = new File(dir);
		if (!dr.exists())
		{
			return;
		}
		File[] files = dr.listFiles();
		if (files == null)
		{
			// listFiles() returns null when dr is not a directory or cannot
			// be read.
			return;
		}
		for (int i = 0; i < files.length; i++)
		{
			File f = files[i];
			if (f.isDirectory())
			{
				// Use the full path: the bare name would be resolved against
				// the working directory instead of the parent directory.
				indexDirectory(writer, f.getPath());
			}
			else if (f.getName().endsWith(".txt"))
			{
				indexFile(writer, f);
			}
		}
	}

	/**
	 * Indexes a single text file. Hidden, missing or unreadable files are
	 * skipped silently.
	 * <p>
	 * The document gets a "contents" field fed from the file's text and a
	 * stored "filename" field holding the canonical path.
	 * 
	 * @param writer
	 *            index writer that receives the document
	 * @param f
	 *            file to index
	 * @throws IOException
	 *             if the file cannot be read or added to the index
	 */
	private static void indexFile(IndexWriter writer, File f)
			throws IOException
	{
		if (f.isHidden() || !f.exists() || !f.canRead())
		{
			return;
		}

		FileReader contents = new FileReader(f);
		try
		{
			Document doc = new Document();
			doc.add(new Field("contents", contents));
			doc.add(new Field("filename", f.getCanonicalPath(),
					Field.Store.YES, Field.Index.ANALYZED));
			writer.addDocument(doc);
		}
		finally
		{
			// Make sure the file handle is released even if addDocument fails.
			contents.close();
		}
	}

	/**
	 * Splits the given file, rebuilds the index and prints the chunk files
	 * whose contents match the keyword query, followed by the total hit count
	 * and the elapsed search time.
	 * <p>
	 * Any failure is reported via a stack trace; nothing is thrown.
	 * 
	 * @param filepath
	 *            path of the large file to split and index
	 * @param keyword
	 *            query string, parsed with Lucene's query parser against the
	 *            "contents" field
	 * @param topnum
	 *            maximum number of hits to collect and print
	 */
	public static void search(String filepath, String keyword, int topnum)
	{
		try
		{
			splitToSmallFiles(filepath);
			index();

			IndexSearcher is = new IndexSearcher(FSDirectory.open(new File(
					INDEX_DIR)), true);// read-only
			try
			{
				String field = "contents";
				QueryParser parser = new QueryParser(Version.LUCENE_CURRENT,
						field, new StandardAnalyzer(Version.LUCENE_CURRENT));
				Query query = parser.parse(keyword);

				TopScoreDocCollector collector = TopScoreDocCollector.create(
						topnum, false);

				long start = System.currentTimeMillis();

				is.search(query, collector);
				ScoreDoc[] hits = collector.topDocs().scoreDocs;
				for (int i = 0; i < hits.length; i++)
				{
					Document doc = is.doc(hits[i].doc);
					System.out.println(doc.getField("filename"));
				}

				long end = System.currentTimeMillis();

				System.out.println("Found " + collector.getTotalHits()
						+ " document(s) (in " + (end - start)
						+ " milliseconds) that matched query '" + keyword
						+ "':");
			}
			finally
			{
				is.close();
			}
		}
		catch (Exception e)
		{
			e.printStackTrace();
		}
	}

}

ruanzy888888

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
搜索引擎工具类

/** * 搜索引擎工具类 * @author ruanzhiyong6496 * @version 1.0 */public class Lucene{ private static String INDEX_DIR = "D:\\index";// 索引存放目录 private static String DATA_DIR = "D:\\small";// 小文...
复制链接

扫一扫