HttpClient 与 HtmlParser 简易用法

最新推荐文章于 2021-03-01 22:52:17 发布

Jefry2008

最新推荐文章于 2021-03-01 22:52:17 发布

阅读量133

点赞数

分类专栏： Java 文章标签： Apache JavaScript ViewUI

本文链接：https://blog.csdn.net/Jefry2008/article/details/83936042

版权

Java 专栏收录该内容

8 篇文章 0 订阅

订阅专栏

**解析新华信托 HTML**

package com.zte.util;

import java.util.ArrayList;
import java.util.List;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.methods.GetMethod;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.visitors.NodeVisitor;

import com.zte.entity.NewInfo;

public class ParseHtml {

	private static List<NewInfo> newInfos = new ArrayList<NewInfo>();
    private static int pageNumber = 1;
    private static boolean isFirstPage = true;

    public static List<NewInfo> getNewInfos(String url) throws Exception{
    	 HttpClient client = new HttpClient();   
	     client.getHostConfiguration().setProxy("10.130.40.13",8026);
	     HttpMethod method = new GetMethod(url);
	     client.executeMethod(method);   
         parseNew(method.getResponseBodyAsString());
         for(int page =2;page<=pageNumber ; page++) {
        	 method = new GetMethod( url + "&pagenum="+page);
    	     client.executeMethod(method);   
             parseNew(method.getResponseBodyAsString());
         }

	     method.releaseConnection();   

	     return newInfos;
    }

	public static void parseNew(String content) {
		try {
			Parser parser = new Parser(content);
			NodeVisitor visitor = new NodeVisitorImpl();
			parser.visitAllNodesWith(visitor);

		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	private static class NodeVisitorImpl extends NodeVisitor {
		boolean tdTag = false;
		public void visitTag(Tag tag) {
			String href = tag.getAttribute("href");
			if (tag.getTagName().equalsIgnoreCase("TD")) {
				tdTag = true;
			} else if (tdTag && tag.getTagName().equalsIgnoreCase("A") && !href.contains("javascript")) {
				String title = tag.getFirstChild().toHtml().trim();
				String newTitle1 = title.replaceAll("\\s{1,}", " ");
				String newTitle = newTitle1.replace("•", ".");
				NewInfo newInfo = new NewInfo();
				newInfo.setHref(href);
				newInfo.setTitle(newTitle);
				newInfos.add(newInfo);
				tdTag = false;
			} 

			if(isFirstPage && tag.getTagName().equalsIgnoreCase("select")) {
				pageNumber = new Integer(tag.getLastChild().getFirstChild().toHtml().trim());
				isFirstPage = false;
			}

		}
	}

}

**信息实体**

package com.zte.entity;

/**
 * Mutable holder for one scraped link: its target address and its display title.
 * Both properties are {@code null} until set.
 */
public class NewInfo {

	/** Target address of the link. */
	private String href;

	/** @return the link address, or {@code null} if it has not been set */
	public String getHref() {
		return this.href;
	}

	/** @param href the link address to store */
	public void setHref(String href) {
		this.href = href;
	}

	/** Display text of the link. */
	private String title;

	/** @return the link title, or {@code null} if it has not been set */
	public String getTitle() {
		return this.title;
	}

	/** @param title the link title to store */
	public void setTitle(String title) {
		this.title = title;
	}

}

**客户端调用**

/**
 * Command-line demo: fetches the news list, prints each link's address and title,
 * then prints the number of records printed.
 */
public class ParseHtmlTest {

    /**
     * Runs the demo against a fixed news-list URL.
     *
     * @param args unused
     * @throws Exception if {@code ParseHtml.getNewInfos} fails
     */
    public static void main(String[] args) throws Exception {
        final String listUrl = "http://www.nct-china.com/NewsList.aspx?lmid=55";
        final List<NewInfo> links = ParseHtml.getNewInfos(listUrl);

        // The index doubles as the running record count reported at the end.
        int count = 0;
        while (count < links.size()) {
            final NewInfo link = links.get(count);
            System.out.println("链接地址：" + link.getHref());
            System.out.println("链接内容：" + link.getTitle());
            count++;
        }

        System.out.println("共" + count + "条记录");
    }
}

Jefry2008

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
HttpClient,htmlParse简易用法

[b]解析新华信托html[/b][code="java"]package com.zte.util;import java.util.ArrayList;import java.util.List;import org.apache.commons.httpclient.HttpClient;import org.apache.commons.httpclient.Htt...
复制链接

扫一扫

专栏目录