I've recently been working on a portal site whose news module needs automated content collection. Talk of collecting information immediately brings web crawlers to mind: we don't have the time to write all the news ourselves, so, with apologies to sina and sohu, we borrow their content. The web is built on resource sharing, which is something I've always believed in, so I looked into it. I had also thought before about using a crawler to pull resources off the web and put them to use.
package com.opensky.util;
import java.util.HashMap;
import java.util.Map;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasParentFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.BodyTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
* Parsing web pages with HttpClient and HtmlParser.
*
* @author Administrator
*
*/
public class HtmlparseUtil {
WebHttpClient util = new WebHttpClient();
/**
* Extract the hyperlinks from a page and store them in a Map as (href, text).
*
* @param url the page URL
* @param charset the page character set
* @return a map from each link's href to its link text
*/
public Map<String, String> linkGet(String url, String charset) {
String content = util.getWebContentByGet(url, charset);
Map<String, String> linkMap = new HashMap<String, String>();
try {
// Start parsing
Parser parser = Parser.createParser(content, charset);
// Keep only the <a></a> link tags
NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
NodeList list = parser.extractAllNodesThatMatch(linkFilter);
Node node = null;
for (int i = 0; i < list.size(); i++) {
node = list.elementAt(i);
// Record each link as (href, text)
linkMap.put(((LinkTag) node).getLink(), this
.processText(((LinkTag) node).getLinkText()));
}
} catch (ParserException e) {
e.printStackTrace();
}
return linkMap;
}
/**
* Get the content inside the page's <body></body> tag.
*
* @param url the page URL
* @param charset the page character set
* @return the body content as a string
*/
public String bodyGet(String url, String charset) {
String content = util.getWebContentByGet(url, charset);
String body = "";
try {
Parser parser = Parser.createParser(content, charset);
// Filter for the <body></body> tag
NodeFilter bodyFilter = new NodeClassFilter(BodyTag.class);
NodeList list = parser.extractAllNodesThatMatch(bodyFilter);
Node node = null;
for (int i = 0; i < list.size(); i++) {
node = list.elementAt(i);
// Save the page's body content into body
body = ((BodyTag) node).getBody();
}
} catch (ParserException e) {
e.printStackTrace();
}
return body;
}
/**
* Filter out several elements from the page (e.g. the <span> elements with class "term",
* "start-time", id "J_SingleEndTimeLabel", ...) and collect their text.
*
* @param url the page URL
* @param charset the page character set
* @return a map of the extracted values
*/
public Map<String, String> termGet(String url, String charset) {
// Fetch the full HTML content of the page
String content = util.getWebContentByGet(url, charset);
Map<String, String> map = new HashMap<String, String>();
try {
// Start parsing
// Filter out the <span> elements with class "term"
Parser parser = Parser.createParser(content, charset);
// TagNameFilter (tag name) combined with HasAttributeFilter (attribute name and value)
AndFilter filter = new AndFilter(new TagNameFilter("span"),
new HasAttributeFilter("class", "term"));
Node node = null;
NodeList nodeList = parser.parse(filter);
for (int i = 0; i < nodeList.size(); i++) {
node = nodeList.elementAt(i);
// System.out.println("-----------------------------node.toPlainTextString()--------------->");
// System.out.println(node.toPlainTextString());
map.put("term", node.toPlainTextString());
}
// Filter out the <span> elements with class "start-time"
Parser parser2 = Parser.createParser(content, charset);
AndFilter filter2 = new AndFilter(new TagNameFilter("span"),
new HasAttributeFilter("class", "start-time"));
NodeList nodeList2 = parser2.parse(filter2);
for (int i = 0; i < nodeList2.size(); i++) {
node = nodeList2.elementAt(i);
map.put("start-time", node.toPlainTextString());
}
// Filter out the <span> element with id "J_SingleEndTimeLabel"
Parser parser3 = Parser.createParser(content, charset);
AndFilter filter3 = new AndFilter(new TagNameFilter("span"),
new HasAttributeFilter("id", "J_SingleEndTimeLabel"));
NodeList nodeList3 = parser3.parse(filter3);
for (int i = 0; i < nodeList3.size(); i++) {
node = nodeList3.elementAt(i);
map.put("end-time", node.toPlainTextString());
}
// Filter out the <div> elements with class "box post"
Parser parser4 = Parser.createParser(content, charset);
AndFilter filter4 = new AndFilter(new TagNameFilter("div"),
new HasAttributeFilter("class", "box post"));
NodeList nodeList4 = parser4.parse(filter4);
for (int i = 0; i < nodeList4.size(); i++) {
node = nodeList4.elementAt(i);
String temp = node.toPlainTextString().trim();
// Take characters 10-20 of the plain text (position-dependent, so fragile if the page layout changes)
temp = temp.substring(10, 20).trim();
map.put("pre-term", temp);
}
// Filter out the <span> elements with class "J_AwardNumber"
Parser parser5 = Parser.createParser(content, charset);
// AndFilter filter5 =
// new AndFilter(new TagNameFilter("span"),new
// HasAttributeFilter("class","J_AwardNumber"));
NodeList nodeList5 = parser5.parse(new HasAttributeFilter("class",
"J_AwardNumber"));
StringBuffer buffer = new StringBuffer();
for (int i = 0; i < nodeList5.size(); i++) {
node = nodeList5.elementAt(i);
buffer.append("," + node.toPlainTextString());
}
buffer.append("|");
// Filter out the <span> elements with class "blue J_AwardNumber"
Parser parser6 = Parser.createParser(content, charset);
// AndFilter filter6 =
// new AndFilter(new TagNameFilter("span"),new
// HasAttributeFilter("class","blue J_AwardNumber"));
NodeList nodeList6 = parser6.parse(new HasAttributeFilter("class",
"blue J_AwardNumber"));
for (int i = 0; i < nodeList6.size(); i++) {
node = nodeList6.elementAt(i);
buffer.append(node.toPlainTextString() + ",");
}
map.put("numbers", buffer.toString());
} catch (ParserException e) {
e.printStackTrace();
}
return map;
}
/**
* Filter out the <ul> element whose class is "list_00f_f14" and collect the text of its
* <li> children: the "domestic highlights" headline list on Sina's domestic news page.
*
* @param url the page URL
* @param charset the page character set
* @return a map of titleN/hrefN entries, one pair per headline
*/
public Map<String, String> sinaChinaNewsGet(String url, String charset) {
// Fetch the full HTML content of the page
String content = util.getWebContentByGet(url, charset);
Map<String, String> map = new HashMap<String, String>();
try {
// Start parsing
// Filter out the <li> elements inside the <ul> whose class is "list_00f_f14"
Parser parser = Parser.createParser(content, charset);
// TagNameFilter (tag name) combined with HasAttributeFilter (attribute name and value)
//AndFilter filter = new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "list_00f_f14"));
AndFilter filter = new AndFilter(new TagNameFilter("li"), new HasParentFilter(
new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "list_00f_f14"))));
// TagNameFilter filter = new TagNameFilter("a");
Node node = null;
NodeList nodeList = parser.parse(filter);
for (int i = 0; i < nodeList.size(); i++) {
node = nodeList.elementAt(i);
//System.out.println("------------------------>>>> Domestic news section --- Sina >>>>>>>>>>>>>>>>>>");
//System.out.println("Title: " + node.toPlainTextString());
map.put("title" + i, node.toPlainTextString());
NodeList nodeChildList = node.getChildren();
Node nodeChild = null;
for (int j = 0; j < nodeChildList.size(); j++) {
nodeChild = nodeChildList.elementAt(j);
if (nodeChild instanceof LinkTag) {
String hrefStr = ((LinkTag) nodeChild).getAttribute("href");
//System.out.println("Link: " + hrefStr);
map.put("href"+i, hrefStr);
}
}
}
} catch (ParserException e) {
e.printStackTrace();
}
return map;
}
private String processText(String content) {
// Trim the link text and strip all remaining spaces
content = content.trim().replaceAll(" ", "");
// content=content.replaceAll("<p>", "\n");
// content=content.replaceAll("</TD>", "");
// content=content.replaceAll("</div>", "");
// content=content.replaceAll("</a>", "");
// content=content.replaceAll("<a href=.*>", "");
return content;
}
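// A minimal usage sketch. Assumption: the URL and charset below are only examples for
// illustration; they are not taken from the original post, and the actual Sina page may
// use a different encoding.
public static void main(String[] args) {
HtmlparseUtil parserUtil = new HtmlparseUtil();
Map<String, String> news = parserUtil.sinaChinaNewsGet("http://news.sina.com.cn/china/", "gb2312");
// Print every extracted headline title and its link
for (Map.Entry<String, String> entry : news.entrySet()) {
System.out.println(entry.getKey() + " = " + entry.getValue());
}
}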
}
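The class above relies on a WebHttpClient helper whose getWebContentByGet(url, charset) method fetches the raw HTML of a page, but that class isn't shown in the post. As a rough stand-in, a minimal version built on the JDK's HttpURLConnection could look like the sketch below; the original presumably uses HttpClient as the Javadoc suggests, and the package name, timeouts, and error handling here are my own assumptions.

package com.opensky.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

public class WebHttpClient {
    /**
     * Fetch the raw HTML of a page with an HTTP GET and decode it with the given charset.
     */
    public String getWebContentByGet(String url, String charset) {
        StringBuilder html = new StringBuilder();
        try {
            HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection();
            conn.setRequestMethod("GET");
            conn.setConnectTimeout(5000);
            conn.setReadTimeout(5000);
            // Read the response body line by line using the requested charset
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), charset));
            String line;
            while ((line = reader.readLine()) != null) {
                html.append(line).append("\n");
            }
            reader.close();
            conn.disconnect();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return html.toString();
    }
}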