Java 抓取网页:过滤抓取到的网页代码中被 CSS 隐藏的文字和标签

给你一个参考:可以使用 htmlparser 解析抓取到的页面,示例代码如下。

package com.lenxeon.extjs.collector.paraser;

import com.lenxeon.extjs.collector.utils.FilterUtils;

import com.lenxeon.extjs.resource.bean.Article;

import com.lenxeon.utils.cache.RedisCached;

import com.lenxeon.utils.httpclient.HttpClientUtils;

import com.lenxeon.utils.io.JsonUtils;

import org.apache.commons.collections.MapUtils;

import org.apache.commons.io.FileUtils;

import org.apache.commons.lang.StringUtils;

import org.htmlparser.Node;

import org.htmlparser.NodeFilter;

import org.htmlparser.Parser;

import org.htmlparser.Tag;

import org.htmlparser.filters.TagNameFilter;

import org.htmlparser.tags.Div;

import org.htmlparser.tags.LinkTag;

import org.htmlparser.tags.TitleTag;

import org.htmlparser.util.NodeList;

import org.htmlparser.util.ParserException;

import org.htmlparser.visitors.NodeVisitor;

import org.springframework.web.util.HtmlUtils;

import redis.clients.jedis.Jedis;

import java.io.File;

import java.util.ArrayList;

import java.util.HashMap;

import java.util.List;

import java.util.Map;

public class RssParaser extends Paraser {

public Article article = new Article();

public String domain = null;

public static Jedis jedis = RedisCached.getJedis();

public static String urlqueue = "paraser:url:queue";

public static String urlstore = "paraser:url:store";

public static String urlstoreProcessed = "paraser:url:processed";

public NodeList parseNodeList(String data) {

Parser parser = Parser.createParser(data, "utf-8");

final StringBuffer _title = new StringBuffer();

final NodeList nodeList = new NodeList();

try {

NodeVisitor visitor = new NodeVisitor() {

public void visitTag(Tag tag) {

nodeList.add(tag);

if (tag instanceof TitleTag) {

String title = ((TitleTag) tag).getTitle();

if (!StringUtils.isBlank(title)) {

_title.append(title);

}

}

}

};

parser.visitAllNodesWith(visitor);

article.setTitle(_title.toString());

} catch (ParserException e) {

e.printStackTrace();

}

return nodeList;

}

/**

* parase all url in page

*/

public List paraseUrl(NodeList nodelist, Map urlFilter) {

List list = new ArrayList();

NodeFilter filter = new TagNameFilter("A");

nodelist = nodelist.extractAllNodesThatMatch(filter, true);

for (int i = 0; i < nodelist.size(); i++) {

LinkTag link = (LinkTag) nodelist.elementAt(i);

String href = link.getAttribute("href");

href = processUrl(href);

if (FilterUtils.filter(href, urlFilter)) {

list.add(href);

}

}

return list;

}

public String processUrl(String href) {

if (!StringUtils.isBlank(href)) {

if (href.startsWith("http://")) {

//do nothing

} else if (href.startsWith("/")) {

href = domain + href;

} else {

href = domain + "/" + href;

}

}

return href;

}

public static void main(String args[]) {

try {

String rule_data = FileUtils.readFileToString(new File("D:\\studio\\apps\\src\\main\\java\\com\\lenxeon\\extjs\\collector\\sites\\hdz8.cn.json"));

ParaserInfo paraserInfo = new ParaserInfo();

paraserInfo.setClasspath(RssParaser.class.getName());

paraserInfo.setRules((Map) JsonUtils.toBean(rule_data, Map.class));

paraserInfo.setUrl("http://www.hdz8.cn");

new RssParaser().parase(paraserInfo);

} catch (Exception e) {

e.printStackTrace();

}

System.exit(0);

}

public void process(String url, Map rules) {

try {

Map config = MapUtils.getMap(rules, "site");

if (!rules.containsKey("site")) {

return;

}

//开始解析

domain = MapUtils.getString(config, "domain");

String encoding = MapUtils.getString(config, "encoding");

String startUrl = MapUtils.getString(config, "startUrl");

int interval = MapUtils.getIntValue(config, "interval");

Thread.sleep(interval);

if (StringUtils.isBlank(url)) {

url = startUrl;

}

System.out.println("working with url ==== " + url);

String html = HttpClientUtils.getString(url, encoding);

if (StringUtils.isBlank(html)) {

return;

}

NodeList nodeList = parseNodeList(html);

//过滤url参数

processUrl(nodeList, rules);

//处理标题

String title = processTitle(rules);

//处理分类

String sort = processSort(nodeList, rules);

//处理内容

String content = processContent(nodeList, rules);

List list = processPage(nodeList, rules);

if (list != null && list.size() > 0) {

for (String[] data : list) {

if (data != null || data.length == 2 || StringUtils.isNotBlank(data[1])) {

html = HttpClientUtils.getString(processUrl(data[1]), encoding);

if (StringUtils.isBlank(html)) {

return;

}

nodeList = parseNodeList(html);

String temp = processContent(nodeList, rules);

if (StringUtils.isBlank(temp)) {

return;

}

content = content + "\r\n" + temp;

}

}

}

content = content.replaceAll(title, "");

content = FilterUtils.beautifulText(content);

if (content == null) {

System.out.println("数据不合法,丢弃");

return;

}

// System.out.println("-------------------------------------------------------------------------\r\n"

// + content

// + "\n\r-------------------------------------------------------------------------");

// content = FilterUtils.processContent(content);

Map article = new HashMap();

article.put("title", title);

article.put("content", content);

article.put("id", "");

article.put("sortNames", "新文章." + sort);

article.put("writer", "admin");

article.put("keywords", "");

article.put("url", url);

// System.out.println(JsonUtils.toJson(article));

//XStream xstream = null;

//try {

//xstream = new XStream();

xstream = new XStream(new DomDriver()); // 需要xpp3 jar

//} catch (Exception e) {

//e.printStackTrace();

//}

//xstream.registerConverter(new XMLConvert());

//String xml = xstream.toXML(article);

Map form = new HashMap();

// form.put("action", "ins_article");

// form.put("api_key", "213a7766-c252-40dc-bb39-94b238ad9206");

// form.put("data", JsonUtils.toJson(article));

// String result = HttpClientUtils.postString("http://localhost/dedecms/dede/service.php", form);

// String result = HttpClientUtils.postString("http://www.ileshan.net/ctrl/service.php", form);

form.put("data", JsonUtils.toJson(article));

String result = HttpClientUtils.postString("http://localhost:8088/apps/api/article.json?method=push", form);

// String result = HttpClientUtils.postString("http://zendlab.com:8080/apps/api/article.json?method=push", form);

System.out.println("result===========" + result);

} catch (Exception e) {

e.printStackTrace();

} finally {

// System.exit(0);

}

}

private List processPage(NodeList nodeList, Map rules) {

Map config = MapUtils.getMap(rules, "site");

//处理分页数据

Map pagesCfg = (Map) MapUtils.getObject(config, "pagesfilter");

String attr = MapUtils.getString(pagesCfg, "attr");

String identify = MapUtils.getString(pagesCfg, "identify", "");

List> list = (List>) MapUtils.getObject(pagesCfg, "rule");

List pages = null;

for (int i = 0; i < nodeList.size(); i++) {

Node node = nodeList.elementAt(i);

if (node instanceof Div) {

Div div = (Div) node;

String attrStr = div.getAttribute(attr);

if (!StringUtils.isBlank(attrStr) && attrStr.contains(identify)) {

pages = FilterUtils.getTagA(FilterUtils.prepare(div.getChildrenHTML(), list));

break;

}

}

}

System.out.println("pages:" + JsonUtils.toJson(pages));

return pages;

}

private String processContent(NodeList nodeList, Map rules) {

Map config = MapUtils.getMap(rules, "site");

Map contentCfg = (Map) MapUtils.getObject(config, "contentfilter");

String attr = MapUtils.getString(contentCfg, "attr");

String identify = MapUtils.getString(contentCfg, "identify", "");

String content = null;

for (int i = 0; i < nodeList.size(); i++) {

Node node = nodeList.elementAt(i);

if (node instanceof Div) {

Div link = (Div) node;

String attrStr = link.getAttribute(attr);

if (!StringUtils.isBlank(attrStr) && attrStr.contains(identify)) {

content = link.getChildrenHTML();

break;

}

}

}

content = HtmlUtils.htmlUnescape(content);

List> list = (List>) MapUtils.getObject(contentCfg, "rule");

content = FilterUtils.prepare(content, list);

content = FilterUtils.beautifulText(content);

return content;

}

private String processSort(NodeList nodeList, Map rules) {

Map config = MapUtils.getMap(rules, "site");

String content = null;

Map sortCfg = (Map) MapUtils.getObject(config, "sortfilter");

String attr = MapUtils.getString(sortCfg, "attr");

String identify = MapUtils.getString(sortCfg, "identify", "");

for (int i = 0; i < nodeList.size(); i++) {

Node node = nodeList.elementAt(i);

if (node instanceof Div) {

Div link = (Div) node;

String attrStr = link.getAttribute(attr);

if (!StringUtils.isBlank(attrStr) && attrStr.contains(identify)) {

content = link.getChildrenHTML();

break;

}

}

}

String sortName = "未分类";

List links = FilterUtils.getTagA(content);

int idx = MapUtils.getIntValue(sortCfg, "idx", -1);

String orderby = MapUtils.getString(sortCfg, "orderby");

if (idx >= 0 && links != null && links.size() > idx) {

if (StringUtils.endsWith("desc", orderby)) {

List _links = new ArrayList();

for (String[] item : links) {

_links.add(0, item);

}

links = _links;

}

sortName = links.get(idx)[0];

}

System.out.println("分类content=" + sortName);

return sortName;

}

private String processTitle(Map rules) {

Map config = MapUtils.getMap(rules, "site");

List> list = (List>) MapUtils.getObject(config, "titlefilter");

String title = FilterUtils.prepare(article.getTitle(), list);

this.article.setTitle(title);

return title;

}

public void processUrl(NodeList nodeList, Map rules) {

Map config = MapUtils.getMap(rules, "site");

//过滤url参数

Map urlFilter = new HashMap();

List> list = (List>) MapUtils.getObject(config, "urlfilter");

for (Map m : list) {

String role = MapUtils.getString(m, "role");

String text = MapUtils.getString(m, "text");

urlFilter.put(role, text);

}

List urls = paraseUrl(nodeList, urlFilter);

for (String u : urls) {

if (!StringUtils.isBlank(u) && !jedis.sismember(urlstore, u)) {

jedis.lpush(urlqueue, u);

jedis.sadd(urlstore, u);

}

}

}

@Override

protected void beforeParase(ParaserInfo paraserInfo) {

//To change body of implemented methods use File | Settings | File Templates.

}

public void parase(ParaserInfo paraserInfo) {

Map rules = paraserInfo.getRules();

Map config = MapUtils.getMap(rules, "site");

if (!rules.containsKey("site")) {

return;

}

//开始解析

domain = MapUtils.getString(config, "domain");

String url = MapUtils.getString(config, "startUrl");

urlqueue += domain;

urlstore += domain;

urlstoreProcessed += domain;

// process(url, paraserInfo.getRules());

jedis.lpush(urlqueue, url);

while ((url = jedis.rpop(urlqueue)) != null) {

System.out.println("url:" + url);

if (jedis.sadd(urlstoreProcessed, url) == 1) {

try {

process(url, paraserInfo.getRules());

} catch (Throwable e) {

System.err.println("出错啦...");

}

}

}

}

@Override

protected void afterParase(ParaserInfo paraserInfo) {

//To change body of implemented methods use File | Settings | File Templates.

}

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值