一个小小的搜索例子,实现对某个文件夹下的文件进行搜索
这里只列出主要代码,完整的 project 在附件中。导入到 MyEclipse 时,请根据自己的环境修改配置文件 paoding-dic-home.properties 中的词典路径(前提是你已经安装了庖丁解牛的词典)。在页面中搜索“项目”即可看到结果(基本每个文件中都含有“项目”这个词)。
附件中有项目T_Search,文件lucene\data,索引\lucene\index
MIndexer.java:创建索引(对文件进行创建,先把文件内容读取成String)
public class MIndexer {

    /**
     * Builds a full-text index under E:\lucene\index from the files found
     * (recursively) in E:\lucene\data, using the Paoding Chinese analyzer.
     * The index is recreated from scratch on every call (create flag = true).
     * Errors are reported to stderr; the method never throws.
     */
    public void createIndex() {
        long start = System.currentTimeMillis();
        IndexWriter writer = null;
        try {
            // Paoding Chinese word-segmenting analyzer.
            Analyzer analyzer = new PaodingAnalyzer();
            // true => wipe and rebuild the index directory on every run.
            writer = new IndexWriter("E:\\lucene\\index", analyzer, true,
                    IndexWriter.MaxFieldLength.UNLIMITED);
            // Index the data directory (.txt, .pdf, .htm, .html, .jsp, .php).
            indexDocs(writer, new File("E:\\lucene\\data"));
            writer.optimize();
            System.out.println("用时:" + (System.currentTimeMillis() - start) + " 毫秒");
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always close the writer: a leaked writer keeps the Lucene
            // write lock and blocks every subsequent indexing run.
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Recursively walks {@code file} and adds one Document per indexable file.
     * Both the canonical path ("filename") and the extracted text ("contents")
     * are stored, analyzed, and given position/offset term vectors so the
     * searcher can highlight matches.
     *
     * @param writer open IndexWriter to add documents to
     * @param file   file or directory to index
     * @throws IOException on index write failure
     */
    static void indexDocs(IndexWriter writer, File file) throws IOException {
        if (!file.canRead()) {
            return;
        }
        if (file.isDirectory()) {
            String[] children = file.list();
            if (children != null) {
                for (int i = 0; i < children.length; i++) {
                    indexDocs(writer, new File(file, children[i]));
                }
            }
            return;
        }
        String name = file.getName();
        boolean indexable = name.endsWith(".htm") || name.endsWith(".html")
                || name.endsWith(".jsp") || name.endsWith(".php")
                || name.endsWith(".txt") || name.endsWith(".pdf");
        if (!indexable) {
            return;
        }
        try {
            // One Document per file, roughly one "record" in the index.
            Document doc = new Document();
            doc.add(new Field("filename", file.getCanonicalPath(),
                    Field.Store.YES, Field.Index.ANALYZED,
                    Field.TermVector.WITH_POSITIONS_OFFSETS));
            // PDFs need text extraction; everything else is read as UTF-8 text.
            String contents = name.endsWith(".pdf") ? pdf2txt(file) : ReadFile(file);
            doc.add(new Field("contents", contents,
                    Field.Store.YES, Field.Index.ANALYZED,
                    Field.TermVector.WITH_POSITIONS_OFFSETS));
            writer.addDocument(doc);
        } catch (FileNotFoundException fnfe) {
            // Was silently swallowed before; at least report which file
            // vanished between listing and reading so failures are visible.
            System.err.println("跳过无法打开的文件: " + file + " (" + fnfe.getMessage() + ")");
        }
    }

    /**
     * Reads a text file as UTF-8 and returns its contents as a single String.
     * Lines are joined with '\n' so that the last token of one line and the
     * first token of the next are not fused into a single term.
     *
     * @param f file to read
     * @return file contents, or whatever was read before an I/O error
     */
    public static String ReadFile(File f) {
        StringBuilder temp = new StringBuilder();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(
                    new FileInputStream(f), "UTF-8"));
            String line;
            while ((line = br.readLine()) != null) {
                // Keep a separator between lines; plain append would merge
                // the adjacent words across the line break.
                temp.append(line).append('\n');
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // The reader was never closed before: file-handle leak.
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return temp.toString();
    }

    /**
     * Extracts the text of a PDF file with PDFBox.
     *
     * The previous version parsed the document twice (PDDocument.load AND a
     * separate PDFParser), leaked one of the two documents, could throw an
     * NPE in its finally block when loading failed, and created a "temp"
     * directory / deleted a file that was never actually written. All of
     * that dead and broken machinery is removed: load once, strip, close.
     *
     * @param pfile PDF file to extract
     * @return extracted text, or "" if the file is missing / unreadable
     */
    public static String pdf2txt(File pfile) {
        String content = "";
        // lastIndexOf >= 1 keeps the original guard: require a non-empty
        // base name before the ".pdf" suffix.
        if (pfile.exists() && pfile.getName().lastIndexOf(".pdf") >= 1) {
            PDDocument pdDoc = null;
            try {
                pdDoc = PDDocument.load(pfile);
                PDFTextStripper stripper = new PDFTextStripper();
                content = stripper.getText(pdDoc);
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                // Guarded close: pdDoc is null when load() itself failed.
                if (pdDoc != null) {
                    try {
                        pdDoc.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        }
        return content;
    }
}
MSearcher.java:搜索,返回符合条件的List
public class MSearcher {

    // Location of the Lucene index built by MIndexer.
    private static final String INDEX_PATH = "E:\\lucene\\index";

    /**
     * Searches the "filename" and "contents" fields for {@code keyword}
     * (OR semantics across the two fields) and returns one page of results.
     * Matched fragments are highlighted via BoldFormatter; when no highlight
     * fragment is produced the raw stored value (truncated to
     * {@code content_length} for contents) is returned instead.
     *
     * @param keyword        query string (Paoding-analyzed)
     * @param highlight      kept for interface compatibility (highlighting is
     *                       always attempted; this flag is not consulted)
     * @param content_length max characters of contents to return per hit
     * @param start          1-based index of the first hit to return
     * @param length         number of hits per page
     * @return matching beans; empty list on bad input or error
     */
    public List<MBean> searchIndex(String keyword, boolean highlight,
            int content_length, int start, int length) {
        List<MBean> mList = new ArrayList<MBean>();
        if (!new File(INDEX_PATH).exists() || keyword == null
                || keyword.trim().equals("") || length <= 0) {
            return mList;
        }
        start = (start > 0) ? start : 1;
        String[] FIELD = { "filename", "contents" };
        // Paoding Chinese word-segmenting analyzer (must match indexing).
        Analyzer analyzer = new PaodingAnalyzer();
        FSDirectory directory = null;
        IndexReader reader = null;
        Searcher searcher = null;
        try {
            directory = FSDirectory.getDirectory(INDEX_PATH);
            reader = IndexReader.open(directory);
            // SHOULD/SHOULD => a hit in either field is enough (OR).
            BooleanClause.Occur[] flags = new BooleanClause.Occur[] {
                    BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD };
            Query query = MultiFieldQueryParser.parse(keyword, FIELD,
                    flags, analyzer);
            // Reuse the already-open reader (consistent with
            // searchIndexLength; the old code opened a second reader here).
            searcher = new IndexSearcher(reader);
            query = query.rewrite(reader);
            // Paging: collect the first start + length - 1 hits.
            TopDocCollector collector = new TopDocCollector(start + length - 1);
            searcher.search(query, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;
            BoldFormatter formatter = new BoldFormatter();
            Highlighter highlighter = new Highlighter(formatter,
                    new QueryScorer(query));
            highlighter.setTextFragmenter(new SimpleFragmenter(content_length));
            for (int i = start - 1; i < hits.length; i++) {
                Document doc = searcher.doc(hits[i].doc);
                String _filename = doc.get(FIELD[0]);
                String _contents = doc.get(FIELD[1]);
                if (_contents == null) {
                    // Defensive: stored field missing => avoid NPE below.
                    _contents = "";
                }
                int maxNumFragmentsRequired = 5;
                String fragmentSeparator = "...";
                // Term vectors were stored with positions/offsets at index
                // time, so highlighting can re-tokenize from the index.
                TermPositionVector tpv_filename = (TermPositionVector) reader
                        .getTermFreqVector(hits[i].doc, FIELD[0]);
                TermPositionVector tpv_contents = (TermPositionVector) reader
                        .getTermFreqVector(hits[i].doc, FIELD[1]);
                String high_filename = "";
                String high_contents = "";
                if (tpv_filename != null) {
                    TokenStream token_filename = TokenSources
                            .getTokenStream(tpv_filename);
                    high_filename = highlighter.getBestFragments(
                            token_filename, _filename,
                            maxNumFragmentsRequired, fragmentSeparator);
                }
                if (tpv_contents != null) {
                    TokenStream token_contents = TokenSources
                            .getTokenStream(tpv_contents);
                    high_contents = highlighter.getBestFragments(
                            token_contents, _contents,
                            maxNumFragmentsRequired, fragmentSeparator);
                }
                MBean mBean = new MBean();
                // Fall back to the raw stored value when no fragment matched.
                mBean.setFilename((high_filename != null && !high_filename
                        .equals("")) ? high_filename : _filename);
                mBean.setContents((high_contents != null && !high_contents
                        .equals("")) ? high_contents
                        : (_contents.length() > content_length
                                ? _contents.substring(0, content_length)
                                : _contents));
                mList.add(mBean);
            }
        } catch (ParseException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close in reverse-open order; previously these leaked whenever
            // an exception fired before the in-line close calls.
            closeQuietly(searcher, reader, directory);
        }
        return mList;
    }

    /**
     * Counts the hits for {@code keyword} over the same two fields, capped at
     * {@code maxLength}. Used by the servlet to size the pagination bar.
     *
     * @return number of hits (0 on bad input or error)
     */
    public Integer searchIndexLength(String keyword, boolean highlight,
            int content_length, int start, int length, int maxLength) {
        int _count = 0;
        if (!new File(INDEX_PATH).exists() || keyword == null
                || keyword.trim().equals("") || length <= 0) {
            return _count;
        }
        String[] FIELD = { "filename", "contents" };
        Analyzer analyzer = new PaodingAnalyzer();
        FSDirectory directory = null;
        IndexReader reader = null;
        Searcher searcher = null;
        try {
            directory = FSDirectory.getDirectory(INDEX_PATH);
            reader = IndexReader.open(directory);
            BooleanClause.Occur[] flags = new BooleanClause.Occur[] {
                    BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD };
            Query query = MultiFieldQueryParser.parse(keyword, FIELD,
                    flags, analyzer);
            searcher = new IndexSearcher(reader);
            query = query.rewrite(reader);
            TopDocCollector collector = new TopDocCollector(maxLength);
            searcher.search(query, collector);
            _count = collector.topDocs().scoreDocs.length;
        } catch (ParseException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(searcher, reader, directory);
        }
        return _count;
    }

    // Best-effort close of the Lucene resources; null-safe, never throws.
    private static void closeQuietly(Searcher searcher, IndexReader reader,
            FSDirectory directory) {
        if (searcher != null) {
            try {
                searcher.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (directory != null) {
            directory.close();
        }
    }
}
Search.java:处理用户请求的Servlet
public class Search extends HttpServlet {

    private static final long serialVersionUID = 1L;
    // Primitives instead of boxed constants: these are only used in
    // arithmetic/comparisons, so boxing bought nothing.
    private static final int NUMBER = 10;          // hits per page
    private static final int CONTENT_LENGTH = 50;  // snippet length
    private static final boolean HIGHLIGHT = true;
    private MSearcher mSearcher = new MSearcher();

    /**
     * Builds the Lucene index once at servlet startup. The old code rebuilt
     * the entire index inside getPageList(), i.e. on EVERY request — moved
     * here so a request only searches.
     */
    @Override
    public void init() throws ServletException {
        new MIndexer().createIndex();
    }

    /**
     * Handles a search request: reads query parameter "q" and 1-based offset
     * "start", runs the search, computes the pagination bar, and forwards to
     * index.jsp with attributes q, start, previous, next, pList, mList.
     */
    @Override
    public void doPost(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        request.setCharacterEncoding("UTF-8");
        String q = request.getParameter("q") != null
                ? request.getParameter("q").trim()
                : request.getParameter("q");
        System.out.println("----" + q);
        int start = request.getParameter("start") != null
                ? Integer.valueOf(request.getParameter("start"))
                : 0;
        // Total hit count (capped) drives the pagination bar.
        int all_count = mSearcher.searchIndexLength(q, HIGHLIGHT,
                CONTENT_LENGTH, start, NUMBER, NUMBER * 1000);
        List<MBean> mList = mSearcher.searchIndex(q, HIGHLIGHT,
                CONTENT_LENGTH, start, NUMBER);
        List<PBean> pList = getPageList(all_count, start);
        if (start > NUMBER) {
            request.setAttribute("previous", start - NUMBER);
        }
        if (start < all_count - NUMBER) {
            request.setAttribute("next", NUMBER + (start != 0 ? start : 1));
        }
        request.setAttribute("q", q);
        request.setAttribute("start", start);
        request.setAttribute("pList", pList);
        request.setAttribute("mList", mList.isEmpty() ? null : mList);
        request.getRequestDispatcher("/index.jsp").forward(request, response);
    }

    /** GET is handled identically to POST. */
    @Override
    public void doGet(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        doPost(request, response);
    }

    /**
     * Builds the list of page links: up to 10 pages before and 10 pages
     * after the current page. No longer rebuilds the index (see init()).
     *
     * @param all_count total number of hits
     * @param start     1-based offset of the current page's first hit
     */
    private static List<PBean> getPageList(int all_count, int start) {
        List<PBean> pList = new ArrayList<PBean>();
        int all_page = (all_count <= 0) ? 1
                : (all_count / NUMBER + (all_count % NUMBER > 0 ? 1 : 0));
        int now_page = (start <= 0) ? 1
                : (start / NUMBER + (start % NUMBER > 0 ? 1 : 0));
        int first = (now_page - 10 > 0) ? now_page - 10 : 1;
        int last = ((now_page + 9) <= all_page) ? (now_page + 9) : all_page;
        for (int i = first; i <= last; i++) {
            PBean pBean = new PBean();
            pBean.setPage(i);
            // Each page's "start" is the 1-based offset of its first hit.
            pBean.setStart((pBean.getPage() - 1) * NUMBER + 1);
            pList.add(pBean);
        }
        return pList;
    }
}