package com.htmlparser;
import java.net.URL;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;

import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.beans.LinkBean;
import org.htmlparser.beans.StringBean;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.Div;
import org.htmlparser.tags.HeadingTag;
import org.htmlparser.tags.Span;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
 * Crawls the Sina finance stock index page and, for each matching article URL,
 * extracts and prints the news title, body, publish date and author.
 *
 * @author sanshang
 */
public class ParseNews {
private Parser parser = null; // 用于分析网页的分析器。
private Set<String> links = new HashSet<String>();
/*
* 测试LinkBean的用法
*/
public Set<String> testLinkBean() {
LinkBean linkBean = new LinkBean();
linkBean.setURL("http://finance.sina.com.cn/stock/");
URL[] urls = linkBean.getLinks();
for (int i = 0; i < urls.length; i++) {
URL url = urls[i];
if (accept(url.toString())) {
// System.out.println("url is :" + url);
links.add(url.toString());
}
}
return links;
}
public boolean accept(String url) {
if (url
.matches("http://finance.sina.com.cn/stock/gujiayidong/20090526/[\\d]+.shtml")) {
return true;
} else {
return false;
}
}
/*
* 得到标题
*/
private String getTitle(NodeFilter titleFilter, Parser parser) {
String titleName = "";
try {
NodeList titleNodeList = (NodeList) parser.parse(titleFilter);
for (int i = 0; i < titleNodeList.size(); i++) {
HeadingTag title = (HeadingTag) titleNodeList.elementAt(i);
titleName = title.getStringText();
}
} catch (ParserException ex) {
}
return titleName;
}
/*
* 得到新聞編輯
*/
private String getNewsAuthor(NodeFilter newsauthorFilter, Parser parser) {
String newsAuthor = "";
try {
NodeList authorList = (NodeList) parser.parse(newsauthorFilter);
for (int i = 0; i < authorList.size(); i++) {
Span authorSpan = (Span) authorList.elementAt(i);
newsAuthor = authorSpan.getStringText();
}
} catch (ParserException ex) {
}
return newsAuthor;
}
/*
* 获得新闻的日期
*/
private String getNewsDate(NodeFilter dateFilter, Parser parser) {
String newsDate = null;
try {
NodeList dateList = (NodeList) parser.parse(dateFilter);
for (int i = 0; i < dateList.size(); i++) {
Span dateTag = (Span) dateList.elementAt(i);
newsDate = dateTag.getStringText();
}
} catch (ParserException ex) {
}
return newsDate;
}
/**
* 获取新闻的内容
*/
private String getNewsContent(NodeFilter newsContentFilter, Parser parser) {
String content = null;
StringBuilder builder = new StringBuilder();
try {
NodeList newsContentList = (NodeList) parser
.parse(newsContentFilter);
for (int i = 0; i < newsContentList.size(); i++) {
Div newsContenTag = (Div) newsContentList.elementAt(i);
builder = builder.append(newsContenTag.getStringText());
}
content = builder.toString(); // 转换为String 类型。
if (content != null) {
parser.reset();
parser = Parser.createParser(content, "gb2312");
StringBean sb = new StringBean();
sb.setCollapse(true);
parser.visitAllNodesWith(sb);
content = sb.getStrings();
// content = content.replaceAll("\\\".*[a-z].*\\}", "");
content = content.replace("已有_COUNT_位网友发表评论 我要评论", "");
content = content
.replace(
"新浪声明:此消息系转载自新浪合作媒体,新浪网登载此文出于传递更多信息之目的,并不意味着赞同其观点或证实其描述。文章内容仅供参考,不构成投资建议。投资者据此操作,风险自担。",
"");
content = content.replace("以下是本文可能影响或涉及到的板块个股:", "");
content = content
.replace(
"新浪声明:新浪网登载此文出于传递更多信息之目的,并不意味着赞同其观点或证实其描述。文章内容仅供参考,不构成投资建议。投资者据此操作,风险自担。",
"");
} else {
System.out.println("没有得到新闻内容!");
}
} catch (ParserException ex) {
}
return content;
}
public void parser(String url) {
try {
parser = new Parser(url);
// NodeFilter titleFilter = new TagNameFilter("h1");
// 标题Filter
NodeFilter titleFilter = new AndFilter(new TagNameFilter("h1"),
new HasAttributeFilter("id", "artibodyTitle"));
// 内容Filter
NodeFilter contentFilter = new AndFilter(new TagNameFilter("div"),
new HasAttributeFilter("id", "artibody"));
// 日期Filter
NodeFilter newsdateFilter = new AndFilter(
new TagNameFilter("span"), new HasAttributeFilter("id",
"pub_date"));
// 作者Filter
NodeFilter newsauthorFilter = new AndFilter(new TagNameFilter(
"span"), new HasAttributeFilter("id", "media_name"));
String newsTitle = getTitle(titleFilter, parser);
System.out.println(newsTitle);
parser.reset(); // 记得每次用完parser后,要重置一次parser。要不然就得不到我们想要的内容了。
String newsContent = getNewsContent(contentFilter, parser);
System.out.println(newsContent); // 输出新闻的内容,查看是否符合要求
parser.reset();
String newsDate = getNewsDate(newsdateFilter, parser);
System.out.println(newsDate);
parser.reset();
String newsauthor = getNewsAuthor(newsauthorFilter, parser);
System.out.println(newsauthor);
System.out
.println("------------------------------------------------------------");
} catch (ParserException ex) {
}
}
public static void main(String[] args) {
ParseNews parseNews = new ParseNews();
// parseNews.testLinkBean();
parseNews.links = parseNews.testLinkBean();
for (String o : parseNews.links) {
parseNews.parser(o);
}
}
}
// Note: the second argument of String.matches is a regular expression. When the
// URL contains a "?", remember to escape it in the pattern, e.g. as "[?]".