// Reference example: crawling and parsing pages with the htmlparser library.
package com.lenxeon.extjs.collector.paraser;
import com.lenxeon.extjs.collector.utils.FilterUtils;
import com.lenxeon.extjs.resource.bean.Article;
import com.lenxeon.utils.cache.RedisCached;
import com.lenxeon.utils.httpclient.HttpClientUtils;
import com.lenxeon.utils.io.JsonUtils;
import org.apache.commons.collections.MapUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.Div;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.NodeVisitor;
import org.springframework.web.util.HtmlUtils;
import redis.clients.jedis.Jedis;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class RssParaser extends Paraser {
/** Article bean of this parser; parseNodeList stores the page title on it and processTitle rewrites that title. */
public Article article = new Article();
/** Site base URL, read from the "domain" rule entry; processUrl(String) prepends it to relative links. */
public String domain = null;
/** Redis client obtained from RedisCached; static, so it is shared by every instance of this class. */
public static Jedis jedis = RedisCached.getJedis();
/** Redis list key used as the crawl queue (lpush to enqueue, rpop to dequeue); parase() suffixes it with the site domain. */
public static String urlqueue = "paraser:url:queue";
/** Redis set key of URLs that have already been enqueued; parase() suffixes it with the site domain. */
public static String urlstore = "paraser:url:store";
/** Redis set key of URLs that have already been handed to process(); parase() suffixes it with the site domain. */
public static String urlstoreProcessed = "paraser:url:processed";
/**
 * Parses an HTML string and flattens every tag the parser visits into one list.
 *
 * <p>As a side effect the text of each non-blank {@code <title>} tag is concatenated and stored
 * as the title of {@link #article}. A {@link ParserException} is printed and swallowed; in that
 * case the article title is left untouched and the tags collected so far are still returned.
 *
 * @param data HTML markup to parse
 * @return the visited tags, in visiting order
 */
public NodeList parseNodeList(String data) {
    Parser htmlParser = Parser.createParser(data, "utf-8");
    final NodeList collected = new NodeList();
    final StringBuilder titleText = new StringBuilder();
    NodeVisitor collector = new NodeVisitor() {
        public void visitTag(Tag tag) {
            collected.add(tag);
            if (!(tag instanceof TitleTag)) {
                return;
            }
            String text = ((TitleTag) tag).getTitle();
            if (StringUtils.isNotBlank(text)) {
                titleText.append(text);
            }
        }
    };
    try {
        htmlParser.visitAllNodesWith(collector);
        article.setTitle(titleText.toString());
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return collected;
}
/**
 * Collects the target URL of every anchor found in, or nested beneath, the given nodes.
 *
 * <p>Each {@code href} is made absolute with {@link #processUrl(String)} and kept only when
 * {@code FilterUtils.filter} returns {@code true} for it. Anchors without an {@code href}
 * (for example {@code <a name="top">}) and matched nodes that are not {@link LinkTag}s are
 * skipped instead of being passed on as {@code null} or causing a {@link ClassCastException}.
 *
 * @param nodelist  nodes to search; the search is recursive
 * @param urlFilter filter rules, handed unchanged to {@code FilterUtils.filter}
 * @return the accepted URLs in the order the anchors were matched; never {@code null}
 */
public List<String> paraseUrl(NodeList nodelist, Map urlFilter) {
    List<String> urls = new ArrayList<String>();
    NodeList anchors = nodelist.extractAllNodesThatMatch(new TagNameFilter("A"), true);
    for (int i = 0; i < anchors.size(); i++) {
        Node node = anchors.elementAt(i);
        if (!(node instanceof LinkTag)) {
            continue;
        }
        String href = ((LinkTag) node).getAttribute("href");
        if (StringUtils.isBlank(href)) {
            // Nothing to crawl for an anchor that has no target.
            continue;
        }
        href = processUrl(href);
        if (FilterUtils.filter(href, urlFilter)) {
            urls.add(href);
        }
    }
    return urls;
}
/**
 * Turns a link target into an absolute URL.
 *
 * <p>Blank input is returned unchanged. A target that already starts with {@code http://} or
 * {@code https://} (in any letter case) is returned unchanged; previously only lower-case
 * {@code http://} was recognised, so secure links were wrongly prefixed with the domain.
 * Anything else is treated as a path and prefixed with {@link #domain}, inserting a {@code /}
 * when the target does not already start with one.
 *
 * @param href link target as found in the page; may be {@code null}
 * @return the absolute URL, or {@code href} itself when it is blank or already absolute
 */
public String processUrl(String href) {
    if (StringUtils.isBlank(href)) {
        return href;
    }
    boolean absolute = StringUtils.startsWithIgnoreCase(href, "http://")
            || StringUtils.startsWithIgnoreCase(href, "https://");
    if (absolute) {
        return href;
    }
    if (href.startsWith("/")) {
        return domain + href;
    }
    return domain + "/" + href;
}
/**
 * Command-line entry point: loads a JSON rule file, wraps it in a {@code ParaserInfo} and runs
 * {@link #parase(ParaserInfo)} on a new {@code RssParaser}.
 *
 * <p>The JVM is always terminated through {@link System#exit(int)}: status 0 after a normal
 * run, status 1 when loading the rules or the crawl itself threw an exception.
 *
 * @param args optional; when {@code args[0]} is non-blank it is used as the rule-file path
 *             instead of the built-in development path
 */
public static void main(String args[]) {
    String rulePath = "D:\\studio\\apps\\src\\main\\java\\com\\lenxeon\\extjs\\collector\\sites\\hdz8.cn.json";
    if (args != null && args.length > 0 && StringUtils.isNotBlank(args[0])) {
        rulePath = args[0];
    }
    int status = 0;
    try {
        String rule_data = FileUtils.readFileToString(new File(rulePath));
        ParaserInfo paraserInfo = new ParaserInfo();
        paraserInfo.setClasspath(RssParaser.class.getName());
        paraserInfo.setRules((Map) JsonUtils.toBean(rule_data, Map.class));
        paraserInfo.setUrl("http://www.hdz8.cn");
        new RssParaser().parase(paraserInfo);
    } catch (Exception e) {
        e.printStackTrace();
        status = 1;
    }
    System.exit(status);
}
/**
 * Downloads one page, extracts an article from it and posts the article to the article API.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>read {@code domain}, {@code encoding}, {@code startUrl} and {@code interval} from the
 *       {@code "site"} rule entry and sleep for {@code interval} milliseconds when positive;</li>
 *   <li>fetch the page and parse it with {@link #parseNodeList(String)};</li>
 *   <li>enqueue the page's links, then extract title, category and content;</li>
 *   <li>fetch every pagination link returned by {@code processPage} and append its content,
 *       separated by {@code "\r\n"}; if any such page is empty the whole article is dropped;</li>
 *   <li>remove the title text from the content, clean it with
 *       {@code FilterUtils.beautifulText}, and post the result as JSON in form field
 *       {@code data} to {@code http://localhost:8088/apps/api/article.json?method=push}.</li>
 * </ol>
 *
 * <p>Every exception is printed and swallowed, so a failing page never propagates to the caller.
 *
 * @param url   page to fetch; when blank, the configured {@code startUrl} is used
 * @param rules parsed rule file; nothing happens unless it contains a {@code "site"} entry
 */
public void process(String url, Map rules) {
    try {
        if (rules == null || !rules.containsKey("site")) {
            return;
        }
        Map config = MapUtils.getMap(rules, "site");
        // Start parsing
        domain = MapUtils.getString(config, "domain");
        String encoding = MapUtils.getString(config, "encoding");
        String startUrl = MapUtils.getString(config, "startUrl");
        int interval = MapUtils.getIntValue(config, "interval");
        if (interval > 0) {
            // Thread.sleep rejects negative values; a missing setting yields 0.
            Thread.sleep(interval);
        }
        if (StringUtils.isBlank(url)) {
            url = startUrl;
        }
        System.out.println("working with url ==== " + url);
        String html = HttpClientUtils.getString(url, encoding);
        if (StringUtils.isBlank(html)) {
            return;
        }
        NodeList nodeList = parseNodeList(html);
        // Filter and enqueue the URLs found on the page
        processUrl(nodeList, rules);
        // Title
        String title = processTitle(rules);
        // Category
        String sort = processSort(nodeList, rules);
        // Content
        String content = processContent(nodeList, rules);
        if (content == null) {
            // Without a content block there is nothing to publish (and nothing to append pages to).
            System.out.println("数据不合法,丢弃");
            return;
        }
        List<String[]> pages = processPage(nodeList, rules);
        if (pages != null) {
            for (String[] page : pages) {
                // All three conditions must hold before page[1] may be read.
                if (page == null || page.length < 2 || StringUtils.isBlank(page[1])) {
                    continue;
                }
                String pageHtml = HttpClientUtils.getString(processUrl(page[1]), encoding);
                if (StringUtils.isBlank(pageHtml)) {
                    // An incomplete article is dropped rather than published partially.
                    return;
                }
                String pageContent = processContent(parseNodeList(pageHtml), rules);
                if (StringUtils.isBlank(pageContent)) {
                    return;
                }
                content = content + "\r\n" + pageContent;
            }
        }
        // Literal removal: the title is plain text, not a regular expression.
        content = StringUtils.replace(content, title, "");
        content = FilterUtils.beautifulText(content);
        if (content == null) {
            System.out.println("数据不合法,丢弃");
            return;
        }
        Map<String, Object> payload = new HashMap<String, Object>();
        payload.put("title", title);
        payload.put("content", content);
        payload.put("id", "");
        payload.put("sortNames", "新文章." + sort);
        payload.put("writer", "admin");
        payload.put("keywords", "");
        payload.put("url", url);
        // Kept raw: the parameter type of HttpClientUtils.postString is not visible here.
        Map form = new HashMap();
        form.put("data", JsonUtils.toJson(payload));
        String result = HttpClientUtils.postString("http://localhost:8088/apps/api/article.json?method=push", form);
        System.out.println("result===========" + result);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
private List processPage(NodeList nodeList, Map rules) {
Map config = MapUtils.getMap(rules, "site");
//处理分页数据
Map pagesCfg = (Map) MapUtils.getObject(config, "pagesfilter");
String attr = MapUtils.getString(pagesCfg, "attr");
String identify = MapUtils.getString(pagesCfg, "identify", "");
List> list = (List>) MapUtils.getObject(pagesCfg, "rule");
List pages = null;
for (int i = 0; i < nodeList.size(); i++) {
Node node = nodeList.elementAt(i);
if (node instanceof Div) {
Div div = (Div) node;
String attrStr = div.getAttribute(attr);
if (!StringUtils.isBlank(attrStr) && attrStr.contains(identify)) {
pages = FilterUtils.getTagA(FilterUtils.prepare(div.getChildrenHTML(), list));
break;
}
}
}
System.out.println("pages:" + JsonUtils.toJson(pages));
return pages;
}
private String processContent(NodeList nodeList, Map rules) {
Map config = MapUtils.getMap(rules, "site");
Map contentCfg = (Map) MapUtils.getObject(config, "contentfilter");
String attr = MapUtils.getString(contentCfg, "attr");
String identify = MapUtils.getString(contentCfg, "identify", "");
String content = null;
for (int i = 0; i < nodeList.size(); i++) {
Node node = nodeList.elementAt(i);
if (node instanceof Div) {
Div link = (Div) node;
String attrStr = link.getAttribute(attr);
if (!StringUtils.isBlank(attrStr) && attrStr.contains(identify)) {
content = link.getChildrenHTML();
break;
}
}
}
content = HtmlUtils.htmlUnescape(content);
List> list = (List>) MapUtils.getObject(contentCfg, "rule");
content = FilterUtils.prepare(content, list);
content = FilterUtils.beautifulText(content);
return content;
}
/**
 * Determines the category name of a page.
 *
 * <p>The category block is the first {@code <div>} whose attribute named by the
 * {@code sortfilter.attr} rule is non-blank and contains the {@code sortfilter.identify} text.
 * The links inside it are extracted with {@code FilterUtils.getTagA}; element 0 of the entry at
 * position {@code sortfilter.idx} is the category name. When {@code sortfilter.orderby} ends
 * with {@code "desc"} the position is counted from the last link instead of the first.
 *
 * <p>The arguments of the {@code endsWith} test used to be swapped, which made an empty
 * {@code orderby} count as descending.
 *
 * @param nodeList tags of the page, as produced by {@code parseNodeList}
 * @param rules    parsed rule file
 * @return the category name, or {@code "未分类"} when none can be determined
 */
private String processSort(NodeList nodeList, Map rules) {
    Map config = MapUtils.getMap(rules, "site");
    Map sortCfg = MapUtils.getMap(config, "sortfilter");
    String attr = MapUtils.getString(sortCfg, "attr");
    String identify = MapUtils.getString(sortCfg, "identify", "");
    String blockHtml = null;
    // Without an attribute name there is nothing to look up on the divs.
    if (StringUtils.isNotBlank(attr)) {
        for (int i = 0; i < nodeList.size(); i++) {
            Node node = nodeList.elementAt(i);
            if (!(node instanceof Div)) {
                continue;
            }
            Div div = (Div) node;
            String attrStr = div.getAttribute(attr);
            if (!StringUtils.isBlank(attrStr) && attrStr.contains(identify)) {
                blockHtml = div.getChildrenHTML();
                break;
            }
        }
    }
    String sortName = "未分类";
    List<String[]> links = null;
    if (blockHtml != null) {
        links = FilterUtils.getTagA(blockHtml);
    }
    int idx = MapUtils.getIntValue(sortCfg, "idx", -1);
    String orderby = MapUtils.getString(sortCfg, "orderby");
    if (idx >= 0 && links != null && links.size() > idx) {
        // Counting from the end is equivalent to reversing the list and reading position idx.
        boolean descending = StringUtils.endsWith(orderby, "desc");
        int position = descending ? links.size() - 1 - idx : idx;
        String[] link = links.get(position);
        if (link != null && link.length > 0) {
            sortName = link[0];
        }
    }
    System.out.println("分类content=" + sortName);
    return sortName;
}
private String processTitle(Map rules) {
Map config = MapUtils.getMap(rules, "site");
List> list = (List>) MapUtils.getObject(config, "titlefilter");
String title = FilterUtils.prepare(article.getTitle(), list);
this.article.setTitle(title);
return title;
}
public void processUrl(NodeList nodeList, Map rules) {
Map config = MapUtils.getMap(rules, "site");
//过滤url参数
Map urlFilter = new HashMap();
List> list = (List>) MapUtils.getObject(config, "urlfilter");
for (Map m : list) {
String role = MapUtils.getString(m, "role");
String text = MapUtils.getString(m, "text");
urlFilter.put(role, text);
}
List urls = paraseUrl(nodeList, urlFilter);
for (String u : urls) {
if (!StringUtils.isBlank(u) && !jedis.sismember(urlstore, u)) {
jedis.lpush(urlqueue, u);
jedis.sadd(urlstore, u);
}
}
}
/**
 * Override of the inherited {@code beforeParase} hook; this implementation does nothing.
 *
 * <p>NOTE(review): presumably invoked by {@code Paraser} before a crawl starts; confirm against
 * the superclass. {@code parase(ParaserInfo)} in this class does not call it.
 *
 * @param paraserInfo crawl description; unused here
 */
@Override
protected void beforeParase(ParaserInfo paraserInfo) {
// Intentionally empty: no set-up is performed.
}
/**
 * Crawls a site: seeds the Redis queue with the configured start URL and processes queued URLs
 * until the queue is empty.
 *
 * <p>The three Redis key fields are set to their base name followed by the site domain. They are
 * assigned rather than appended to, so calling this method more than once no longer makes the
 * keys grow by one domain suffix per call. A URL is handed to
 * {@link #process(String, Map)} only the first time it is added to the {@code urlstoreProcessed}
 * set. A failure while processing one URL is reported on {@code System.err} and does not stop
 * the crawl.
 *
 * @param paraserInfo crawl description; nothing happens unless its rules contain a
 *                    {@code "site"} entry
 */
public void parase(ParaserInfo paraserInfo) {
    Map rules = paraserInfo.getRules();
    if (rules == null || !rules.containsKey("site")) {
        return;
    }
    Map config = MapUtils.getMap(rules, "site");
    // Start parsing
    domain = MapUtils.getString(config, "domain");
    String url = MapUtils.getString(config, "startUrl");
    // The literals repeat the initial values of the key fields; keep them in sync.
    urlqueue = "paraser:url:queue" + domain;
    urlstore = "paraser:url:store" + domain;
    urlstoreProcessed = "paraser:url:processed" + domain;
    if (StringUtils.isNotBlank(url)) {
        // A null value cannot be pushed to Redis; without a start URL only leftovers are drained.
        jedis.lpush(urlqueue, url);
    }
    while ((url = jedis.rpop(urlqueue)) != null) {
        System.out.println("url:" + url);
        if (jedis.sadd(urlstoreProcessed, url) == 1) {
            try {
                process(url, rules);
            } catch (Throwable e) {
                System.err.println("出错啦...");
                // Keep the cause visible; the original message alone hid what went wrong.
                e.printStackTrace();
            }
        }
    }
}
/**
 * Override of the inherited {@code afterParase} hook; this implementation does nothing.
 *
 * <p>NOTE(review): presumably invoked by {@code Paraser} after a crawl finishes; confirm against
 * the superclass. {@code parase(ParaserInfo)} in this class does not call it.
 *
 * @param paraserInfo crawl description; unused here
 */
@Override
protected void afterParase(ParaserInfo paraserInfo) {
// Intentionally empty: no clean-up is performed.
}
}