// HtmlparseUtil.java

import java.util.*;importorg.htmlparser.Node;importorg.htmlparser.NodeFilter;importorg.htmlparser.Parser;importorg.htmlparser.filters.AndFilter;importorg.htmlparser.filters.HasAttributeFilter;importorg.htmlparser.filters.NodeClassFilter;importorg.htmlparser.filters.TagNameFilter;importorg.htmlparser.tags.BodyTag;importorg.htmlparser.tags.LinkTag;importorg.htmlparser.util.NodeList;importorg.htmlparser.util.ParserException;/*** httpclient与htmlparse对网页的解析

*

*@authorAdministrator

**/

public classHtmlparseUtil {

WebHttpClient util=newWebHttpClient();/*** 获得网页中的超链接,将href和text保存在Map中:map(href,text)

*@paramurl

*@paramcharset

*@return

*/

public MaplinkGet(String url, String charset) {

String content=util.getWebContentByGet(url,charset);

Map linkMap = new HashMap();try{//开始解析

Parser parser =Parser.createParser(content, charset);//过滤出标签

NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);

NodeList list=parser.extractAllNodesThatMatch(linkFilter);

Node node= null;for (int i = 0; i < list.size(); i++) {

node=list.elementAt(i);//获得网页中的链接map(href,text)

linkMap.put(((LinkTag) node).getLink(), this.processText(((LinkTag) node).getLinkText()));

}

}catch(ParserException e) {

e.printStackTrace();

}returnlinkMap;

}/*** 获得网页

标签中的内容, 保存在body中

*@paramurl

*@paramcharset

*@return

*/

publicString bodyGet(String url, String charset) {

String content=util.getWebContentByGet(url,charset);

String body= "";try{

Parser parser=Parser.createParser(content, charset);//过滤

标签

NodeFilter bodyFilter = new NodeClassFilter(BodyTag.class);

NodeList list=parser.extractAllNodesThatMatch(bodyFilter);

Node node= null;for (int i = 0; i < list.size(); i++) {

node=list.elementAt(i);//获得网页内容 保存在content中

body =((BodyTag) node).getBody();

}

}catch(ParserException e) {

e.printStackTrace();

}returnbody;

}/*** 过滤出class为term的元素,并获得他们的文本

*@paramurl

*@paramcharset

*@return

*/

public MaptermGet(String url, String charset) {

String content=util.getWebContentByGet(url,charset);

Map map = new HashMap();try{//开始解析//过滤出class为term的元素

Parser parser =Parser.createParser(content, charset);

AndFilter filter=

new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("class","term"));

Node node= null;

NodeList nodeList=parser.parse(filter);for (int i = 0; i < nodeList.size(); i++) {

node=nodeList.elementAt(i);

map.put("term", node.toPlainTextString());

}//过滤出class为start-time的元素

Parser parser2 =Parser.createParser(content, charset);

AndFilter filter2=

new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("class","start-time"));

NodeList nodeList2=parser2.parse(filter2);for (int i = 0; i < nodeList2.size(); i++) {

node=nodeList2.elementAt(i);

map.put("start-time", node.toPlainTextString());

}//过滤出id为J_SingleEndTimeLabel的元素

Parser parser3 =Parser.createParser(content, charset);

AndFilter filter3=

new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("id","J_SingleEndTimeLabel"));

NodeList nodeList3=parser3.parse(filter3);for (int i = 0; i < nodeList3.size(); i++) {

node=nodeList3.elementAt(i);

map.put("end-time", node.toPlainTextString());

}//过滤出class为box post的

元素

Parser parser4 =Parser.createParser(content, charset);

AndFilter filter4=

new AndFilter(new TagNameFilter("div"),new HasAttributeFilter("class","box post"));

NodeList nodeList4=parser4.parse(filter4);for (int i = 0; i < nodeList4.size(); i++) {

node=nodeList4.elementAt(i);

String temp=node.toPlainTextString().trim();

temp=temp.substring(10,20).trim();

map.put("pre-term", temp);

}//过滤出class为J_AwardNumber的元素

Parser parser5 =Parser.createParser(content, charset);//AndFilter filter5 =//new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("class","J_AwardNumber"));

NodeList nodeList5 = parser5.parse(new HasAttributeFilter("class","J_AwardNumber"));

StringBuffer buffer=newStringBuffer();for (int i = 0; i < nodeList5.size(); i++) {

node=nodeList5.elementAt(i);

buffer.append(","+node.toPlainTextString());

}

buffer.append("|");//过滤出class为blue J_AwardNumber的元素

Parser parser6 =Parser.createParser(content, charset);//AndFilter filter6 =//new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("class","blue J_AwardNumber"));

NodeList nodeList6 = parser6.parse(new HasAttributeFilter("class","blue J_AwardNumber"));for (int i = 0; i < nodeList6.size(); i++) {

node=nodeList6.elementAt(i);

buffer.append(node.toPlainTextString()+",");

}

map.put("numbers", buffer.toString());

}catch(ParserException e) {//TODO Auto-generated catch block

e.printStackTrace();

}returnmap;

}privateString processText(String content){

content=content.trim().replaceAll(" ", "");//content=content.replaceAll("

", "\n");//content=content.replaceAll("", "");//content=content.replaceAll("

", "");//content=content.replaceAll("", "");//content=content.replaceAll(" ", "");

returncontent;

}public static voidmain(String[] str) {

String url="http://caipiao.taobao.com/lottery/order/lottery_dlt.htm?type=1";

HtmlparseUtil util=newHtmlparseUtil();

Map map=util.termGet(url, "gb2312");

System.out.println("term="+map.get("term"));//10074

System.out.println("start-time="+map.get("start-time"));//

System.out.println("end-time="+map.get("end-time"));//

System.out.println("pre-term="+map.get("pre-term"));//

System.out.println("numbers="+map.get("numbers"));//

/*Map linkMap = util.linkGet(url, "gb2312");

for (String s : linkMap.keySet()) {

System.out.println(s + " = " + linkMap.get(s));

//如果是个链接,则再获取它的

中的内容

// if (s.startsWith("http")) {

// util.bodyGet(s, "gb2312");

// }

}*/}

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值