Web Crawler (Spider)

   I have recently been building a portal site and needed to collect content for its news module. When it comes to gathering information, a web crawler is the obvious answer: we do not have the resources to write all the news ourselves, so, with apologies to Sina and Sohu, we borrow their information. The web is built on sharing resources, which is something I have always believed in, and I had long wanted to use a crawler to pull resources off the web. The utility class below is what I ended up with: it uses HtmlParser on top of an HttpClient helper to extract links, body content, and specific elements from a page.
   
package com.opensky.util;
import java.util.HashMap;
import java.util.Map;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasParentFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.BodyTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Parses web pages fetched with HttpClient using HtmlParser.
 * 
 * @author Administrator
 */
public class HtmlparseUtil {
	// HTTP helper used to download the raw page source
	WebHttpClient util = new WebHttpClient();

	/**
	 * Extracts the hyperlinks from a page and stores href and text in a Map: map(href, text).
	 * 
	 * @param url the page to fetch
	 * @param charset the character set used to decode the page
	 * @return a map from link href to link text
	 */
	public Map<String, String> linkGet(String url, String charset) {
		String content = util.getWebContentByGet(url, charset);
		Map<String, String> linkMap = new HashMap<String, String>();
		try {
			// Start parsing the downloaded content
			Parser parser = Parser.createParser(content, charset);
			// Keep only <a></a> tags
			NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
			NodeList list = parser.extractAllNodesThatMatch(linkFilter);
			Node node = null;
			for (int i = 0; i < list.size(); i++) {
				node = list.elementAt(i);
				// Store each link as (href, text)
				linkMap.put(((LinkTag) node).getLink(), this
						.processText(((LinkTag) node).getLinkText()));
			}
		} catch (ParserException e) {
			e.printStackTrace();
		}
		return linkMap;
	}

	/**
	 * Extracts the content of the page's <body></body> tag.
	 * 
	 * @param url the page to fetch
	 * @param charset the character set used to decode the page
	 * @return the body content as a string
	 */
	public String bodyGet(String url, String charset) {
		String content = util.getWebContentByGet(url, charset);
		String body = "";
		try {
			Parser parser = Parser.createParser(content, charset);
			// Keep only the <body></body> tag
			NodeFilter bodyFilter = new NodeClassFilter(BodyTag.class);
			NodeList list = parser.extractAllNodesThatMatch(bodyFilter);
			Node node = null;
			for (int i = 0; i < list.size(); i++) {
				node = list.elementAt(i);
				// Save the body content into body
				body = ((BodyTag) node).getBody();
			}
		} catch (ParserException e) {
			e.printStackTrace();
		}
		return body;
	}

	/**
	 * Extracts the text of the <span class="term"> element, plus the related
	 * start-time, end-time, and award-number elements, into a map.
	 * 
	 * @param url the page to fetch
	 * @param charset the character set used to decode the page
	 * @return a map of the extracted values
	 */
	public Map<String, String> termGet(String url, String charset) {
		// Fetch the full HTML content of the page
		String content = util.getWebContentByGet(url, charset);
		Map<String, String> map = new HashMap<String, String>();
		try {
			// Start parsing: select the <span> elements whose class is "term"
			Parser parser = Parser.createParser(content, charset);
			// TagNameFilter(tag name) combined with HasAttributeFilter(attribute, attribute value)
			AndFilter filter = new AndFilter(new TagNameFilter("span"),
					new HasAttributeFilter("class", "term"));
			Node node = null;
			NodeList nodeList = parser.parse(filter);

			for (int i = 0; i < nodeList.size(); i++) {
				node = nodeList.elementAt(i);
				// System.out.println("-----------------------------node.toPlainTextString()--------------->");
				// System.out.println(node.toPlainTextString());
				map.put("term", node.toPlainTextString());
			}

			// Select the <span> elements whose class is "start-time"
			Parser parser2 = Parser.createParser(content, charset);
			AndFilter filter2 = new AndFilter(new TagNameFilter("span"),
					new HasAttributeFilter("class", "start-time"));
			NodeList nodeList2 = parser2.parse(filter2);
			for (int i = 0; i < nodeList2.size(); i++) {
				node = nodeList2.elementAt(i);
				map.put("start-time", node.toPlainTextString());
			}
			// Select the <span> element whose id is "J_SingleEndTimeLabel"
			Parser parser3 = Parser.createParser(content, charset);
			AndFilter filter3 = new AndFilter(new TagNameFilter("span"),
					new HasAttributeFilter("id", "J_SingleEndTimeLabel"));
			NodeList nodeList3 = parser3.parse(filter3);
			for (int i = 0; i < nodeList3.size(); i++) {
				node = nodeList3.elementAt(i);
				map.put("end-time", node.toPlainTextString());
			}

			// Select the <div> elements whose class is "box post"
			Parser parser4 = Parser.createParser(content, charset);
			AndFilter filter4 = new AndFilter(new TagNameFilter("div"),
					new HasAttributeFilter("class", "box post"));
			NodeList nodeList4 = parser4.parse(filter4);
			for (int i = 0; i < nodeList4.size(); i++) {
				node = nodeList4.elementAt(i);
				String temp = node.toPlainTextString().trim();
				// Position-dependent: a fixed substring (indexes 10-19) is assumed to hold the previous term
				temp = temp.substring(10, 20).trim();
				map.put("pre-term", temp);
			}

			// Select the elements whose class is "J_AwardNumber"
			Parser parser5 = Parser.createParser(content, charset);
			// AndFilter filter5 =
			// new AndFilter(new TagNameFilter("span"),new
			// HasAttributeFilter("class","J_AwardNumber"));
			NodeList nodeList5 = parser5.parse(new HasAttributeFilter("class",
					"J_AwardNumber"));
			StringBuffer buffer = new StringBuffer();
			for (int i = 0; i < nodeList5.size(); i++) {
				node = nodeList5.elementAt(i);
				buffer.append("," + node.toPlainTextString());
			}
			buffer.append("|");

			// Select the elements whose class is "blue J_AwardNumber"
			Parser parser6 = Parser.createParser(content, charset);
			// AndFilter filter6 =
			// new AndFilter(new TagNameFilter("span"),new
			// HasAttributeFilter("class","blue J_AwardNumber"));
			NodeList nodeList6 = parser6.parse(new HasAttributeFilter("class",
					"blue J_AwardNumber"));
			for (int i = 0; i < nodeList6.size(); i++) {
				node = nodeList6.elementAt(i);
				buffer.append(node.toPlainTextString() + ",");
			}

			map.put("numbers", buffer.toString());
		} catch (ParserException e) {
			e.printStackTrace();
		}

		return map;
	}

	/**
	 * Filters out the <ul> element whose class is "list_00f_f14" and collects the
	 * text of its <li> children: the headline list of Sina's domestic news channel.
	 * 
	 * @param url the page to fetch
	 * @param charset the character set used to decode the page
	 * @return a map of headline titles ("title" + i) and their links ("href" + i)
	 */

	public Map<String, String> sinaChinaNewsGet(String url, String charset) {
		// Fetch the full HTML content of the page
		String content = util.getWebContentByGet(url, charset);
		Map<String, String> map = new HashMap<String, String>();
		try {

			// Start parsing: select the <li> elements whose parent is the
			// <ul class="list_00f_f14"> element (Sina's domestic headline list)
			Parser parser = Parser.createParser(content, charset);
			// TagNameFilter(tag name) combined with HasAttributeFilter(attribute, attribute value)
			AndFilter filter = new AndFilter(new TagNameFilter("li"),
					new HasParentFilter(new AndFilter(new TagNameFilter("ul"),
							new HasAttributeFilter("class", "list_00f_f14"))));
			Node node = null;
			NodeList nodeList = parser.parse(filter);
			for (int i = 0; i < nodeList.size(); i++) {
				node = nodeList.elementAt(i);
				//System.out.println("------------------------>>>>国内新闻版块---新浪>>>>>>>>>>>>>>>>>>");
				//System.out.println("标题:"+node.toPlainTextString());
				map.put("title" + i, node.toPlainTextString());
				
				NodeList nodeChildList = node.getChildren();
				Node nodeChild = null;
				for (int j = 0; j < nodeChildList.size(); j++) {
					nodeChild = nodeChildList.elementAt(j);
					if (nodeChild instanceof LinkTag) {
						String hrefStr = ((LinkTag) nodeChild).getAttribute("href");
						//System.out.println("链接:"+hrefStr);
						map.put("href"+i, hrefStr);
					}
				}
				
			}
		} catch (ParserException e) {
			e.printStackTrace();
		}

		return map;
	}

	// Normalize extracted link text: trim and strip embedded spaces
	private String processText(String content) {
		content = content.trim().replaceAll(" ", "");
		// content=content.replaceAll("<p>", "\n");
		// content=content.replaceAll("</TD>", "");
		// content=content.replaceAll("</div>", "");
		// content=content.replaceAll("</a>", "");
		// content=content.replaceAll("<a href=.*>", "");
		return content;
	}

	
}
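
The HtmlparseUtil class above depends on a WebHttpClient helper that this post does not show. As a rough sketch only, here is a minimal version built on java.net.HttpURLConnection; the only things taken from the code above are the name WebHttpClient and the method getWebContentByGet(url, charset), which is assumed to download the page and return its source as a String. The real helper (timeouts, headers, proxy handling, commons-httpclient versus plain JDK) may well look different.

package com.opensky.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Minimal sketch of the HTTP helper used by HtmlparseUtil.
 * Only getWebContentByGet(url, charset) is assumed by the parser code;
 * the real implementation may differ.
 */
public class WebHttpClient {

	public String getWebContentByGet(String url, String charset) {
		StringBuilder sb = new StringBuilder();
		HttpURLConnection conn = null;
		try {
			conn = (HttpURLConnection) new URL(url).openConnection();
			conn.setRequestMethod("GET");
			conn.setConnectTimeout(5000);
			conn.setReadTimeout(10000);
			// Decode the response body with the caller-supplied charset
			BufferedReader reader = new BufferedReader(
					new InputStreamReader(conn.getInputStream(), charset));
			String line;
			while ((line = reader.readLine()) != null) {
				sb.append(line).append("\n");
			}
			reader.close();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (conn != null) {
				conn.disconnect();
			}
		}
		return sb.toString();
	}
}

With a helper like that in place, the parser can be exercised from a small main method. The URL and charset below are placeholders; substitute the actual news page you want to crawl and its encoding.

package com.opensky.util;

import java.util.Map;

public class HtmlparseUtilTest {

	public static void main(String[] args) {
		HtmlparseUtil util = new HtmlparseUtil();
		// Placeholder URL and charset for Sina's domestic news page
		Map<String, String> news = util.sinaChinaNewsGet("http://news.sina.com.cn/china/", "gb2312");
		for (Map.Entry<String, String> entry : news.entrySet()) {
			System.out.println(entry.getKey() + " -> " + entry.getValue());
		}
	}
}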


          
