Lucene 搜索 HTML 文件：使用 Lucene 对 HTML 文件进行索引

最新推荐文章于 2021-06-19 12:22:05 发布

买报纸的小女孩

最新推荐文章于 2021-06-19 12:22:05 发布

阅读量237

点赞数

文章标签： lucene搜索html文件

我修改了lucene的demo包的IndexHTML类，使其可以被其他Java类调用。

IndexHTML类

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.Term;

import org.apache.lucene.index.TermEnum;

import java.io.File;

import java.util.Date;

import java.util.Arrays;

//还需调用demo的其他类。

import org.apache.lucene.demo;

/**

* Create html file index for searching

* @author tyrone

public class IndexHTML {

private String DocsPath=null;

/**

* the path for index file;

private String IndexFilePath=null;

/**

* true during deletion pass

private boolean deleting = false;

/**

* existing index

private IndexReader reader;

/**

* new index being built

private IndexWriter writer;

/**

* document id iterator

private TermEnum uidIter;

private void indexDocs(File file)throws Exception {

if (file.isDirectory()) { // if a directory

String[] files = file.list(); // list its files

Arrays.sort(files); // sort the files

for (int i = 0; i < files.length; i++) // recursively index them

this.indexDocs(new File(file, files[i]));

} else if (file.getPath().endsWith(".html") || // index .html files

file.getPath().endsWith(".htm") || // index .htm files

file.getPath().endsWith(".txt")) { // index .txt files

if (this.uidIter != null) {

String uid = HTMLDocument.uid(file); // construct uid for doc

while (uidIter.term() != null && uidIter.term().field() == "uid" &&

uidIter.term().text().compareTo(uid) < 0) {

if (deleting) { // delete stale docs

System.out.println("deleting " +

HTMLDocument.uid2url(uidIter.term().text()));

reader.delete(uidIter.term());

}

uidIter.next();

}

if (uidIter.term() != null && uidIter.term().field() == "uid" &&

uidIter.term().text().compareTo(uid) == 0) {

uidIter.next(); // keep matching docs

} else if (!deleting) { // add new docs

Document doc = HTMLDocument.Document(file);

System.out.println("adding " + doc.get("url"));

writer.addDocument(doc);

}

} else { // creating a new index

Document doc = HTMLDocument.Document(file);

System.out.println("adding " + doc.get("url"));

writer.addDocument(doc); // add docs unconditionally

}

return;

}

/**

* Walk directory hierarchy in uid order, while keeping uid iterator from

* existing index in sync. Mismatches indicate one of:

* (a) old documents to be deleted;

* (b) unchanged documents, to be left alone;

* or (c) new documents, to be indexed.

private void indexDocs(File file, String index, boolean create)

throws Exception {

if (!create) { // incrementally update

reader = IndexReader.open(index); // open existing index

uidIter = reader.terms(new Term("uid", "")); // init uid iterator

this.indexDocs(file);

if (deleting) { // delete rest of stale docs

while (uidIter.term() != null && uidIter.term().field() == "uid") {

System.out.println("deleting " +

HTMLDocument.uid2url(uidIter.term().text()));

reader.delete(uidIter.term());

uidIter.next();

}

deleting = false;

}

uidIter.close(); // close uid iterator

reader.close(); // close existing index

} else // don‘t have exisiting

this.indexDocs(file);

}

/**

* if create=true, create a new index, else refresh old index.

* @param create

public void run(boolean create) {

try {

String index = "index";

File root = null;

if (this.IndexFilePath!=null) { // index file path

index = this.IndexFilePath;

}

if (this.DocsPath==null){

System.out.println("root directory is not set");

return;

}

root = new File(this.DocsPath);

Date start = new Date();

/**

* not create then maintenance

if (!create) { // delete stale docs

this.deleting = true;

this.indexDocs(root, index, create);

}

writer = new IndexWriter(index, new StandardAnalyzer(), create);

writer.maxFieldLength = 1000000;

this.indexDocs(root, index, create); // add new docs

System.out.println("Optimizing index...");

writer.optimize();

writer.close();

Date end = new Date();

System.out.print(end.getTime() - start.getTime());

System.out.println(" total milliseconds");

} catch (Exception e) {

System.out.println(" caught a " + e.getClass() +

"\n with message: " + e.getMessage());

}

return;

}

/**

* @return Returns the IndexFilePath.

public String getIndexFilePath() {

return IndexFilePath;

}

/**

* @param IndexFilePath The IndexFilePath to set.

public void setIndexFilePath(String property1) {

this.IndexFilePath = property1;

}

/**

* @return Returns the DocsPath.

public String getDocsPath() {

return DocsPath;

}

/**

* @param DocsPath The DocsPath to set.

public void setDocsPath(String property1) {

this.DocsPath = property1;

}

/**

* test

* @param args

public static void main(String[] args){

IndexHTML ih=new IndexHTML();

ih.setDocsPath("D:\\MyProject\\colimas\\clms-doc2\\html");

ih.setIndexFilePath("D:\\MyProject\\colimas\\index");

ih.run(true);

}

运行后生成 3 个文件：_3i8.cfs、deletable、segments。

搜索文件类：

/*
 * Created on 2005/07/28
 *
 * TODO To change the template for this generated file go to
 * Window - Preferences - Java - Code Style - Code Templates
 */

package com.nova.colimas.search.query;

/**
 * Simple mutable holder for the title, path and URL of one document.
 * All three values are {@code null} until set.
 *
 * @author tyrone
 */
public class HitsHTMLDoc {

    private String title;

    private String path;

    private String url;

    /**
     * @return Returns the Url.
     */
    public String getUrl() {
        return url;
    }

    /**
     * @param property1 The Url to set.
     */
    public void setUrl(String property1) {
        this.url = property1;
    }

    /**
     * @return Returns the Path.
     */
    public String getPath() {
        return path;
    }

    /**
     * @param property1 The Path to set.
     */
    public void setPath(String property1) {
        this.path = property1;
    }

    /**
     * @return Returns the Title.
     */
    public String getTitle() {
        return title;
    }

    /**
     * @param property1 The Title to set.
     */
    public void setTitle(String property1) {
        this.title = property1;
    }
}

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.search.Searcher;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.Hits;

import org.apache.lucene.queryParser.QueryParser;

/**

* @author tyrone

* TODO To change the template for this generated type comment go to

* Window - Preferences - Java - Code Style - Code Templates

public class SearchFiles {

private Hits hits;

public Hits getHits(){

return hits;

}

public HitsHTMLDoc[] run(String indexFilePath,String line){

HitsHTMLDoc[] hitdocs;

try {

Searcher searcher = new IndexSearcher(indexFilePath);

Analyzer analyzer = new StandardAnalyzer();

Query query = QueryParser.parse(line, "contents", analyzer);

System.out.println("Searching for: " + query.toString("contents"));

this.hits = searcher.search(query);

if (this.hits.length()==0) return null;

System.out.println(this.hits.length() + " total matching documents");

hitdocs=new HitsHTMLDoc[this.hits.length()];

for (int i = 0; i < hits.length(); i++) {

Document doc = this.hits.doc(i);

String path = doc.get("path");

if (path != null) {

hitdocs[i].setPath(path);

} else {

String url=doc.get("url");

if (url != null) {

hitdocs[i]=new HitsHTMLDoc();

hitdocs[i].setUrl(url);

String title=doc.get("title");

if (title!=null)

hitdocs[i].setTitle(title);

} else {

System.out.println(i + ". " + "No path nor URL for this document");

}

searcher.close();

return hitdocs;

}catch(Exception e){

System.out.println(" caught a " + e.getClass() +

"\n with message: " + e.getMessage());

}

return null;

}

/**

* test

* args=queries

* @author tyrone

public static void main(String[] args){

SearchFiles se=new SearchFiles();

String query="";

HitsHTMLDoc[] hitsdoc;

for (int i=0;i

query=query+args[i]+" ";

hitsdoc=se.run("D:\\MyProject\\colimas\\index",query);

if (hitsdoc==null){

System.out.println("nothing");

return;

}

for (int l=0;l

System.out.println("url:"+hitsdoc[l].getUrl());

System.out.println("path:"+hitsdoc[l].getPath());

System.out.println("title:"+hitsdoc[l].getTitle());

}

注意事项：

1. 引用 Lucene 调试（debug）你的应用程序时，虽然并不需要下面这些 jar 包，但每次都会提示 URLClassPath.class 异常；为方便起见，建议还是下载这些 jar 包。

relaxngDatatype.jar

commons-beanutils.jar

commons-collections.jar

commons-digester.jar

commons-logging.jar

commons-validator.jar

jakarta-oro.jar

struts-legacy.jar

2. 存放 index 文件的目录里不能有其他子目录，否则会试图删除这些目录或报错。

买报纸的小女孩

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫