利用 Lucene 和 NekoHTML 为 RSS 新闻建立索引


import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import javax.swing.text.html.HTML.Tag;

import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
import org.apache.xerces.impl.xpath.XPath;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.TextExtractingVisitor;
import org.w3c.dom.Text;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import com.sun.syndication.feed.synd.SyndCategory;
import com.sun.syndication.feed.synd.SyndContent;
import com.sun.syndication.feed.synd.SyndEnclosure;
import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.io.SyndFeedInput;
import com.sun.syndication.io.XmlReader;

import org.cyberneko.html.parsers.DOMParser;

public class TestParse {

public void parseRss() {
ArrayList<String> feeds = new ArrayList<String>();
feeds.add("http://news.baidu.com/n?cmd=1&class=civilnews&tn=rss&sub=0");
feeds.add("http://news.baidu.com/n?cmd=1&class=rwdt&tn=rss&sub=0");
feeds.add("http://news.baidu.com/n?cmd=1&class=mil&tn=rss&sub=0");
feeds.add("http://news.baidu.com/n?cmd=1&class=finannews&tn=rss&sub=0");
feeds.add("http://rss.sina.com.cn/news/marquee/ddt.xml");
try {
IndexWriter indexwriter = new IndexWriter(FSDirectory
.open(new File("d://htmls")), new SmartChineseAnalyzer(
Version.LUCENE_29), true, MaxFieldLength.UNLIMITED);

for (String rss : feeds) {
URL url = new URL(rss);
// 读取Rss源
XmlReader reader = new XmlReader(url);
System.out.println("Rss源的编码格式为:" + reader.getEncoding());
SyndFeedInput input = new SyndFeedInput();
// 得到SyndFeed对象,即得到Rss源里的所有信息
SyndFeed feed = input.build(reader);
// 得到Rss新闻中子项列表
List entries = feed.getEntries();
// 循环得到每个子项信息
for (int i = 0; i < entries.size(); i++) {
org.apache.lucene.document.Document doc = new Document();
SyndEntry entry = (SyndEntry) entries.get(i);
// 标题、连接地址、标题简介、时间是一个Rss源项最基本的组成部分
System.out.println("标题:" + entry.getTitle());
org.apache.lucene.document.Field titleField = new Field(
"title", entry.getTitle(), Store.YES,
Index.ANALYZED);
doc.add(titleField);
System.out.println("连接地址:" + entry.getLink());
Field urlField = new Field("url", entry.getLink(),
Store.YES, Index.NO);
doc.add(urlField);
try {
String content = getContentByNeko(entry.getLink(), reader
.getEncoding());
System.out.println(content);
Field contentField = new Field("content", content,
Store.YES, Index.ANALYZED);
doc.add(contentField);
} catch (Exception e) {
e.printStackTrace();
}
SyndContent description = entry.getDescription();
Field desField = new Field("description", description
.getValue(), Store.YES, Index.ANALYZED);
doc.add(desField);
// System.out.println("标题简介:" + description.getValue());
// System.out.println("发布时间:" + entry.getPublishedDate());
indexwriter.addDocument(doc);

}
}
indexwriter.optimize();
indexwriter.close();
} catch (Exception e) {
e.printStackTrace();
}
}

private String getContent(String url, String encoding) {
// TODO Auto-generated method stub

try {

Parser parser = new Parser(url);
parser.setEncoding(encoding);
// TextExtractingVisitor visitor = new TextExtractingVisitor();
// visitor.visitStringNode(TAG.);
// parser.visitAllNodesWith(visitor);
// return visitor.getExtractedText();

// NodeList nodes = parser.extractAllNodesThatMatch(new
// NodeClassFilter(TextNode.class));
NodeList nodes = parser
.extractAllNodesThatMatch(new NodeClassFilter(
org.htmlparser.tags.ParagraphTag.class));
if (nodes == null)
return "";
StringBuffer sb = new StringBuffer();
for (int i = 0; i < nodes.size(); i++) {
org.htmlparser.nodes.TagNode textnode = (TagNode) nodes
.elementAt(i);
String line = textnode.toPlainTextString().trim();
/*
* if (line.equals("")) continue; boolean ischinese=false; int
* count=0; for(int idx=0;idx<line.length();idx++){ char u =
* line.charAt(idx); if((u>='\u4E00' && u<='\u9FA5') ||
* (u>='\uF900'&& u<='\uFA2D')){ count++; } }
* if(count>line.length()0.1)
*/
sb.append(line);
}
// sb=delTag("script",sb);
return sb.toString();
} catch (ParserException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return null;
}

public static void main(String[] args) {
new TestParse().parseRss();
}

private String getContentByNeko(String url, String encoding) {
StringBuilder sb = new StringBuilder();
DOMParser parser = new DOMParser();

try {
parser.setFeature("http://xml.org/sax/features/namespaces", false);
BufferedReader in = new BufferedReader(new InputStreamReader(
new URL(url).openStream(), encoding));
parser.parse(new InputSource(in));
in.close();
org.w3c.dom.Document doc = parser.getDocument();
org.w3c.dom.NodeList products = org.apache.xpath.XPathAPI
.selectNodeList(doc, "//P");
org.w3c.dom.Node node = null;
for (int i = 0; i < products.getLength(); i++) {
node = products.item(i);
System.out.println(i + ":\n" + node.getTextContent());
}
} catch (Exception e) {
e.printStackTrace();
}

return sb.toString();
}
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值