Lucene中文索引简单实例

import java.io.File; 
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.text.DecimalFormat;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.w3c.tidy.Tidy;

public class IndexHTMLTidy {
	static String root = "D:/html/zh_CN/api/java/lang";
	static String index = "D:/software/apache-tomcat-6.0.18/index_cn";
	
	static Document doc = null;
	static IndexWriter writer = null;

	public static void main(String[] args) throws Exception {
		writer = new IndexWriter(index, new MMAnalyzer(), true);
		File f = new File(root);
		indexDocs(f);
		writer.optimize();
		writer.close();
		System.out.println("ok...");
	}

	// 递归调用
	public static void indexDocs(File f) throws Exception {
		if (f.isDirectory()) {
			String file[] = f.list();
			for (int i = 0; i < file.length; i++) {
				indexDocs(new File(f, file[i]));
			}
		} else if (f.getName().endsWith(".html")) {
			indexDoc(f);
		}
	}

	// 索引一个文件
	public static void indexDoc(File f) throws Exception {
		doc = new Document();
		System.out.println(f.getPath());
		doc.add(new Field("path", f.getPath(), Field.Store.YES,Field.Index.NO));
		String url = f.getPath();
		url = url.replace("D:\\html\\zh_CN\\", "");
		url = url.replace('\\', '/');
		url = "http://127.0.0.1/" + url;
		doc.add(new Field("url", url, Field.Store.YES,Field.Index.NO));
		String size = new DecimalFormat("0000000000").format(f.length());
		doc.add(new Field("size", size, Field.Store.YES,Field.Index.UN_TOKENIZED));
		doc.add(new Field("lastmodified", DateTools.timeToString(f.lastModified(), DateTools.Resolution.DAY), Field.Store.YES,
				Field.Index.UN_TOKENIZED));

		Tidy tidy = new Tidy();
		tidy.setQuiet(true);
		tidy.setShowWarnings(false);
		// 乱码
		// org.w3c.dom.Document root = tidy.parseDOM(new FileInputStream(f),System.out);

		// 解决乱码问题
		// java.io.InputStream定义了抽象方法read(),从此输入流中读取一个数据字节。
		// java.io.FileInputStream实现了父类中定义的方法read()
		// public class InputStreamReader extends java.io.Reader
		// public class FileInputStream extends java.io.InputStream
		// InputStreamReader的方法read()-->读取单个字符。
		InputStreamReader ips = new InputStreamReader(new FileInputStream(f),"gb2312");
		// 适配器模式
		InputStream is = new ReaderToInputStream(ips);

		org.w3c.dom.Document root = tidy.parseDOM(is, null);
		// 得到根元素
		Element rawDoc = root.getDocumentElement();
		//得到title内容
		String title = getTitle(rawDoc);
		//得到body内容
		String body = getBody(rawDoc);
		
		System.out.println(title);        
		
		doc.add(new Field("title", title, Field.Store.YES,Field.Index.TOKENIZED));
		
		String summary = body;
		if (body.length() >= 200) {
			summary = body.substring(0, 200);
		}
		doc.add(new Field("summary", summary, Field.Store.YES,Field.Index.TOKENIZED));
		doc.add(new Field("content", body, Field.Store.YES,Field.Index.TOKENIZED));
		writer.addDocument(doc);
	}

	// 适配器
	public static class ReaderToInputStream extends InputStream {
		Reader reader;

		public ReaderToInputStream(Reader reader) {
			super();
			this.reader = reader;
		}

		@Override
		public int read() throws IOException {
			try {
				return reader.read();
			} catch (IOException e) {
				throw e;
			}
		}
	}

	// 得到title标签内容
	protected static String getTitle(Element rawDoc) {
		if (rawDoc == null) {
			return "";
		}
		String title = "";
		NodeList children = rawDoc.getElementsByTagName("title");
		if (children.getLength() > 0) {
			Element titleElement = ((Element) children.item(0));
			Text text = (Text) titleElement.getFirstChild();
			if (text != null) {
				title = text.getData();
			}
		}
		return title;
	}

	// 得到body标签内容
	protected static String getBody(Element rawDoc) {
		if (rawDoc == null) {
			return "";
		}
		String body = "";
		NodeList children = rawDoc.getElementsByTagName("body");
		if (children.getLength() > 0) {
			body = getText(children.item(0));
		}
		return body;
	}

	// 递归调用,因为标签里面还有标签
	protected static String getText(Node node) {
		NodeList children = node.getChildNodes();
		StringBuffer sb = new StringBuffer();
		for (int i = 0; i < children.getLength(); i++) {
			Node child = children.item(i);
			switch (child.getNodeType()) {
			case Node.ELEMENT_NODE:
				sb.append(getText(child));
				sb.append(" ");
				break;
			case Node.TEXT_NODE:
				sb.append(((Text) child).getData());
				break;
			}
		}
		return sb.toString();
	}
}

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值