import java.util.*;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.BodyTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Web-page scraping helpers built on HttpClient (fetch) and HtmlParser (parse).
 *
 * @author Administrator
 */
public classHtmlparseUtil {
WebHttpClient util=newWebHttpClient();/*** 获得网页中的超链接,将href和text保存在Map中:map(href,text)
*@paramurl
*@paramcharset
*@return
*/
public MaplinkGet(String url, String charset) {
String content=util.getWebContentByGet(url,charset);
Map linkMap = new HashMap();try{//开始解析
Parser parser =Parser.createParser(content, charset);//过滤出标签
NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
NodeList list=parser.extractAllNodesThatMatch(linkFilter);
Node node= null;for (int i = 0; i < list.size(); i++) {
node=list.elementAt(i);//获得网页中的链接map(href,text)
linkMap.put(((LinkTag) node).getLink(), this.processText(((LinkTag) node).getLinkText()));
}
}catch(ParserException e) {
e.printStackTrace();
}returnlinkMap;
}/*** 获得网页
标签中的内容, 保存在body中*@paramurl
*@paramcharset
*@return
*/
publicString bodyGet(String url, String charset) {
String content=util.getWebContentByGet(url,charset);
String body= "";try{
Parser parser=Parser.createParser(content, charset);//过滤
标签NodeFilter bodyFilter = new NodeClassFilter(BodyTag.class);
NodeList list=parser.extractAllNodesThatMatch(bodyFilter);
Node node= null;for (int i = 0; i < list.size(); i++) {
node=list.elementAt(i);//获得网页内容 保存在content中
body =((BodyTag) node).getBody();
}
}catch(ParserException e) {
e.printStackTrace();
}returnbody;
}/*** 过滤出class为term的元素,并获得他们的文本
*@paramurl
*@paramcharset
*@return
*/
public MaptermGet(String url, String charset) {
String content=util.getWebContentByGet(url,charset);
Map map = new HashMap();try{//开始解析//过滤出class为term的元素
Parser parser =Parser.createParser(content, charset);
AndFilter filter=
new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("class","term"));
Node node= null;
NodeList nodeList=parser.parse(filter);for (int i = 0; i < nodeList.size(); i++) {
node=nodeList.elementAt(i);
map.put("term", node.toPlainTextString());
}//过滤出class为start-time的元素
Parser parser2 =Parser.createParser(content, charset);
AndFilter filter2=
new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("class","start-time"));
NodeList nodeList2=parser2.parse(filter2);for (int i = 0; i < nodeList2.size(); i++) {
node=nodeList2.elementAt(i);
map.put("start-time", node.toPlainTextString());
}//过滤出id为J_SingleEndTimeLabel的元素
Parser parser3 =Parser.createParser(content, charset);
AndFilter filter3=
new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("id","J_SingleEndTimeLabel"));
NodeList nodeList3=parser3.parse(filter3);for (int i = 0; i < nodeList3.size(); i++) {
node=nodeList3.elementAt(i);
map.put("end-time", node.toPlainTextString());
}//过滤出class为box post的
Parser parser4 =Parser.createParser(content, charset);
AndFilter filter4=
new AndFilter(new TagNameFilter("div"),new HasAttributeFilter("class","box post"));
NodeList nodeList4=parser4.parse(filter4);for (int i = 0; i < nodeList4.size(); i++) {
node=nodeList4.elementAt(i);
String temp=node.toPlainTextString().trim();
temp=temp.substring(10,20).trim();
map.put("pre-term", temp);
}//过滤出class为J_AwardNumber的元素
Parser parser5 =Parser.createParser(content, charset);//AndFilter filter5 =//new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("class","J_AwardNumber"));
NodeList nodeList5 = parser5.parse(new HasAttributeFilter("class","J_AwardNumber"));
StringBuffer buffer=newStringBuffer();for (int i = 0; i < nodeList5.size(); i++) {
node=nodeList5.elementAt(i);
buffer.append(","+node.toPlainTextString());
}
buffer.append("|");//过滤出class为blue J_AwardNumber的元素
Parser parser6 =Parser.createParser(content, charset);//AndFilter filter6 =//new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("class","blue J_AwardNumber"));
NodeList nodeList6 = parser6.parse(new HasAttributeFilter("class","blue J_AwardNumber"));for (int i = 0; i < nodeList6.size(); i++) {
node=nodeList6.elementAt(i);
buffer.append(node.toPlainTextString()+",");
}
map.put("numbers", buffer.toString());
}catch(ParserException e) {//TODO Auto-generated catch block
e.printStackTrace();
}returnmap;
}privateString processText(String content){
content=content.trim().replaceAll(" ", "");//content=content.replaceAll("
", "\n");//content=content.replaceAll("", "");//content=content.replaceAll("
}public static voidmain(String[] str) {
String url="http://caipiao.taobao.com/lottery/order/lottery_dlt.htm?type=1";
HtmlparseUtil util=newHtmlparseUtil();
Map map=util.termGet(url, "gb2312");
System.out.println("term="+map.get("term"));//第10074期
System.out.println("start-time="+map.get("start-time"));//
System.out.println("end-time="+map.get("end-time"));//
System.out.println("pre-term="+map.get("pre-term"));//
System.out.println("numbers="+map.get("numbers"));//
/*Map linkMap = util.linkGet(url, "gb2312");
for (String s : linkMap.keySet()) {
System.out.println(s + " = " + linkMap.get(s));
中的内容// if (s.startsWith("http")) {