今天遇到一个问题,顺便把它记下来。今天在抓取一个网站的时候,页面看起来很简单,人工
浏览完全没问题,可一旦用程序去抓取,问题就来了:总是提示链接错误。一开始我一头雾水,为什么呢?
仔细想了一下,难道是cookie在作怪?好,那就找一下我以前带cookie提交访问页面的程序,结果不知道放到哪里去了。
花了差不多两个小时,才找到一份源代码。下面是我修改过的程序:
package org.qichao.mode;
import java.io.*;
import org.apache.commons.httpclient.*;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.*;
import org.apache.commons.httpclient.params.HttpMethodParams;
/**
 * Fetches a single page with Commons HttpClient while sending browser-like
 * request headers (User-Agent, Referer, Cookie, ...) and prints the response
 * body to standard output.
 */
public class UR {

    /** Page requested by {@link #main(String[])}. */
    private static final String TARGET_URL =
            "http://www.51ys.com/See_Url_one.asp?operator=25041782C95478FEE686A09";

    /**
     * Charset used to decode the body when the response Content-Type header
     * does not declare one. GBK is chosen because the request advertises
     * gb2312 first in Accept-Charset and GBK is a superset of it.
     */
    private static final String FALLBACK_CHARSET = "GBK";

    /**
     * Executes the GET request and prints the decoded response body.
     * A non-200 status is reported on standard error, and the body is still printed.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        HttpClient httpClient = new HttpClient();
        httpClient.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);

        // Create the GET method instance.
        GetMethod getMethod = new GetMethod(TARGET_URL);
        // NOTE(review): this Host value differs from the host in TARGET_URL - confirm it is intentional.
        getMethod.setRequestHeader("Host", "cards.360114.com");
        getMethod.setRequestHeader("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.2; zh-CN; rv:1.8.1.20) Gecko/20081217 Firefox/2.0.0.20");
        getMethod.setRequestHeader("Accept", "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5");
        getMethod.setRequestHeader("Accept-Language", "zh-cn,zh;q=0.5");
        // Compressed replies are accepted here, so the body must be decompressed below.
        getMethod.setRequestHeader("Accept-Encoding", "gzip,deflate");
        getMethod.setRequestHeader("Accept-Charset", "gb2312,utf-8;q=0.7,*;q=0.7");
        getMethod.setRequestHeader("Keep-Alive", "300");
        getMethod.setRequestHeader("Connection", "keep-alive");
        getMethod.setRequestHeader("Referer", "http://www.360114.com/yellowpage/query.asp?Call=77&h1=GSLANVG&Spara=3&Cpara=&h2=HSIFJTNJHIH&Tpara=&h3=EDJYLUE&h5=@GAXBXFR@R@&scall=");
        // NOTE(review): hard-coded cookie, including an ASP session id - presumably copied
        // from a browser session and likely to stop working once that session expires.
        getMethod.setRequestHeader("Cookie", "__utmz=76121879.1230526182.3.3.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=%E4%BC%81%E4%B8%9A%E9%BB%84%E9%A1%B5%E5%A4%A7%E5%85%A8; __utma=76121879.2444684742963329000.1230517736.1230526182.1230530122.4; __utmc=76121879; ASPSESSIONIDAAATASRQ=IFDOECBAHDBKJFKKMKDOEFCP");
        getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                new DefaultHttpMethodRetryHandler());
        // Makes getResponseCharSet() fall back to GBK instead of the library default
        // when the server omits the charset.
        getMethod.getParams().setContentCharset(FALLBACK_CHARSET);

        try {
            // Execute the request.
            int statusCode = httpClient.executeMethod(getMethod);
            if (statusCode != HttpStatus.SC_OK) {
                System.err.println("Method failed: "
                        + getMethod.getStatusLine());
            }

            // Read the raw body; it may be null when the response carries no content.
            byte[] responseBody = getMethod.getResponseBody();
            if (responseBody == null) {
                responseBody = new byte[0];
            }

            // Undo any content encoding, then decode with the declared (or fallback) charset.
            byte[] decoded = decodeBody(responseBody,
                    getMethod.getResponseHeader("Content-Encoding"));
            String content = new String(decoded, getMethod.getResponseCharSet());

            // Process the content.
            System.out.println(content);
        } catch (HttpException e) {
            // Fatal protocol error: the protocol may be wrong or the returned content is malformed.
            System.out.println("Please check your provided http address!");
            e.printStackTrace();
        } catch (IOException e) {
            // Network or decoding error.
            e.printStackTrace();
        } finally {
            // Always release the connection.
            getMethod.releaseConnection();
        }
    }

    /**
     * Decompresses a response body according to its Content-Encoding header.
     *
     * @param raw             body bytes as received; must not be null
     * @param contentEncoding the response's Content-Encoding header, or null if absent
     * @return the decompressed bytes for gzip/x-gzip/deflate, otherwise {@code raw} unchanged
     * @throws IOException if the compressed data is corrupt
     */
    private static byte[] decodeBody(byte[] raw, Header contentEncoding) throws IOException {
        if (raw.length == 0 || contentEncoding == null || contentEncoding.getValue() == null) {
            return raw;
        }
        String encoding = contentEncoding.getValue().trim();

        InputStream in;
        if ("gzip".equalsIgnoreCase(encoding) || "x-gzip".equalsIgnoreCase(encoding)) {
            in = new java.util.zip.GZIPInputStream(new ByteArrayInputStream(raw));
        } else if ("deflate".equalsIgnoreCase(encoding)) {
            // Handles zlib-wrapped deflate only; a raw deflate stream will fail with an IOException.
            in = new java.util.zip.InflaterInputStream(new ByteArrayInputStream(raw));
        } else {
            return raw;
        }

        try {
            ByteArrayOutputStream out = new ByteArrayOutputStream(raw.length * 2);
            byte[] buffer = new byte[4096];
            int read;
            while ((read = in.read(buffer)) != -1) {
                out.write(buffer, 0, read);
            }
            return out.toByteArray();
        } finally {
            in.close();
        }
    }
}