Jericho HTML Parser是一个简单而功能强大的Java HTML 解析器库,可以分析和处理HTML 文档的一部分,包括一些通用的服务器端标签,同时也可以重新生成无法识别的或无效的HTML 。它也提供了一个有用的HTML 表单分析器。
下载地址:http://sourceforge.net/project/showfiles.php?group_id=101067
HttpClient作为HTTP客户端组件与服务器进行通讯,同时使用了jdom进行XML数据的解析。
* HttpClient 可以在http://jakarta.apache.org/commons/httpclient/downloads.html 下载
* HttpClient 用到了 Apache Jakarta Commons 下的子项目 logging,你可以从这个地址 http://jakarta.apache.org/site/downloads/downloads_commons-logging.cgi 下载到 commons logging,从下载后的压缩包中取出 commons-logging.jar 加到 CLASSPATH 中
* HttpClient 用到了 Apache Jakarta Commons 下的子项目 codec,你可以从这个地址 http://jakarta.apache.org/site/downloads/downloads_commons-codec.cgi 下载到最新的 commons codec,从下载后的压缩包中取出 commons-codec-1.x.jar 加到 CLASSPATH 中
在对网页信息进行抓取时,主要会用到GET 方法
使用 HttpClient 需要以下 6 个步骤:
1. 创建 HttpClient 的实例
2. 创建某种连接方法的实例,在这里是 GetMethod。在 GetMethod 的构造函数中传入待连接的地址
3. 调用第一步中创建好的 HttpClient 实例的 executeMethod 方法来执行第二步中创建好的 method 实例
4. 读 response
5. 释放连接。无论执行方法是否成功,都必须释放连接
6. 对得到后的内容进行处理
在eclipse下建立工程 -->snatch
将上面下载的四个jar文件导入到项目路径中.
环境搭建完成
现在,首先介绍一下HttpClient的使用
在工程目录下创建test包,在包中创建HttpClientTest类
- package test;
- import java.io.IOException;
- import org.apache.commons.httpclient.*;
- import org.apache.commons.httpclient.methods.GetMethod;
- import org.apache.commons.httpclient.params.HttpMethodParams;
- public class HttpClientTest...{
- public static void main(String[] args) {
- //构造HttpClient的实例
- HttpClient httpClient = new HttpClient();
- //创建GET方法的实例
- GetMethod getMethod = new GetMethod( "http://www.google.com.cn" );
- //使用系统提供的默认的恢复策略
- getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
- new DefaultHttpMethodRetryHandler());
- try {
- //执行getMethod
- int statusCode = httpClient.executeMethod(getMethod);
- if (statusCode != HttpStatus.SC_OK) {
- System.err.println("Method failed: "
- + getMethod.getStatusLine());
- }
- //读取内容
- byte [] responseBody = getMethod.getResponseBoy();
- //处理内容
- System.out.println(new String(responseBody));
- } catch (HttpException e) {
- //发生致命的异常,可能是协议不对或者返回的内容有问题
- System.out.println("Please check your provided http address!" );
- e.printStackTrace();
- } catch (IOException e) {
- //发生网络异常
- e.printStackTrace();
- } finally {
- //释放连接
- getMethod.releaseConnection();
- }
- }
- }
package test;
import java.io.IOException;
import org.apache.commons.httpclient.*;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
public class HttpClientTest...{
public static void main(String[] args) {
//构造HttpClient的实例
HttpClient httpClient = new HttpClient();
//创建GET方法的实例
GetMethod getMethod = new GetMethod("http://www.google.com.cn");
//使用系统提供的默认的恢复策略
getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
new DefaultHttpMethodRetryHandler());
try {
//执行getMethod
int statusCode = httpClient.executeMethod(getMethod);
if (statusCode != HttpStatus.SC_OK) {
System.err.println("Method failed: "
+ getMethod.getStatusLine());
}
//读取内容
byte[] responseBody = getMethod.getResponseBoy();
//处理内容
System.out.println(new String(responseBody));
} catch (HttpException e) {
//发生致命的异常,可能是协议不对或者返回的内容有问题
System.out.println("Please check your provided http address!");
e.printStackTrace();
} catch (IOException e) {
//发生网络异常
e.printStackTrace();
} finally {
//释放连接
getMethod.releaseConnection();
}
}
}
这样得到的是页面的源代码.
这里 byte[] responseBody = getMethod.getResponseBody(); 是读取内容
除此之外,我们还可以这样读取:
InputStream inputStream= getMethod.getResponseBodyAsStream();
String responseBody = getMethod.getResponseBodyAsString();
下面结合两者给个示例
取出http://www.ahcourt.gov.cn/gb/ahgy_2004/fyxw/index.html
中"信息快递"栏的前几条信息.
新建类CourtNews
- package test;
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.Iterator;
- import java.util.List;
- import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
- import org.apache.commons.httpclient.HttpClient;
- import org.apache.commons.httpclient.HttpException;
- import org.apache.commons.httpclient.HttpStatus;
- import org.apache.commons.httpclient.methods.GetMethod;
- import org.apache.commons.httpclient.params.HttpMethodParams;
- import au.id.jericho .lib.html .Element;
- import au.id.jericho .lib.html .HTMLElementName;
- import au.id.jericho .lib.html .Segment;
- import au.id.jericho .lib.html .Source;
- /** */ /**
- * @author oscar 07-5-17
- *
- */
- public class CourtNews {
- private int newsCount = 3 ;
- private List newsList = new ArrayList();
- public int getNewsCount() {
- return newsCount;
- }
- public void setNewsCount( int newsCount) {
- this .newsCount = newsCount;
- }
- public List getNewsList() {
- HttpClient httpClient = new HttpClient();
- GetMethod getMethod = new GetMethod(
- "http://www.ahcourt.gov.cn/gb/ahgy_2004/fyxw/index.html " );
- getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
- new DefaultHttpMethodRetryHandler());
- try {
- int statusCode = httpClient.executeMethod(getMethod);
- if (statusCode != HttpStatus.SC_OK) {
- System.err
- .println("Method failed:" + getMethod.getStatusLine());
- }
- String responseBody = getMethod.getResponseBodyAsString();
- responseBody = new String(responseBody.getBytes( "ISO-8859-1" ),
- "GB2312" );
- Source source = new Source(responseBody);
- int tableCount = 0 ;
- for (Iterator i = source.findAllElements(HTMLElementName.TABLE)
- .iterator(); i.hasNext(); tableCount++) {
- Segment segment = (Segment) i.next();
- if (tableCount == 13 ) {
- int hrefCount = 0 ;
- for (Iterator j = segment
- .findAllElements(HTMLElementName.A).iterator(); j
- .hasNext();) {
- Segment childsegment = (Segment) j.next();
- String title = childsegment.extractText();
- title.replace(" " , " " );
- title = trimTitle(title);
- Element childelement = (Element) childsegment;
- if (hrefCount < newsCount) {
- String[] news = new String[] {
- title,
- "http://www.ahcourt.gov.cn"
- + childelement
- .getAttributeValue("href" ) };
- newsList.add(news);
- hrefCount++;
- }
- }
- }
- }
- } catch (HttpException e) {
- System.out.println("please check your provided http address!" );
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- getMethod.releaseConnection();
- }
- return newsList;
- }
- private String trimTitle(String title) {
- String titlenew = "" ;
- for ( int i = 0 ; i < title.length(); i++) {
- if (Character.isSpaceChar(title.charAt(i)))
- titlenew += " " ;
- else {
- titlenew += title.charAt(i);
- }
- }
- return titlenew;
- }
- public static void main(String[] args) {
- // TODO Auto-generated method stub
- CourtNews justice = new CourtNews();
- justice.setNewsCount(4 );
- List list = justice.getNewsList();
- Iterator it = list.iterator();
- while (it.hasNext()) {
- String[] news = (String[]) it.next();
- System.out.println(news[0 ]);
- System.out.println(news[1 ]);
- }
- }
- }