写了一个爬虫需要抓取某个网站的相关信息,文字信息都没有问题,但是 图片信息对方网站使用了防盗链技术,抓取图片的时候出现 403 错误
最后经过各种尝试终于成功
上代码,以备以后查看
/**
 * Downloads an image over HTTP and stores it via {@code UpdateFile.updateFile},
 * returning the identifier that call produces. Sends a Referer header equal to
 * the image URL to get past the remote site's anti-hotlinking check (which
 * otherwise answers 403).
 *
 * @param url   absolute URL of the image to fetch
 * @param picid identifier passed through to {@code UpdateFile.updateFile}
 * @return the value returned by {@code UpdateFile.updateFile}, or {@code null}
 *         when the server responds with a non-200 status or an empty entity
 * @throws Exception on any connect, read, or storage failure (logged and rethrown)
 */
public static String getHttpPicToCS(String url, String picid)
        throws Exception {
    String str = "";
    HttpClient client = new DefaultHttpClient();
    int code = 0;
    HttpGet httpGet = new HttpGet(url);
    InputStream in = null;
    try {
        long t1 = System.currentTimeMillis();
        // Connect timeout (milliseconds).
        httpGet.getParams().setParameter(
                CoreConnectionPNames.CONNECTION_TIMEOUT,
                HTTPCLIENT_CONNECTION_TIMEOUT);
        // Anti-hotlinking workaround: the Referer may be the image URL itself,
        // or any URL on the remote site.
        httpGet.setHeader("Referer", url);
        // Socket read timeout (milliseconds).
        httpGet.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT,
                HTTPCLIENT_SO_TIMEOUT);
        // BUG FIX: the User-Agent must be sent as a request HEADER. The old
        // code stored it under the literal params key "User-Agent", which
        // HttpClient ignores (the params key it reads is
        // CoreProtocolPNames.USER_AGENT), so no User-Agent was transmitted.
        httpGet.setHeader("User-Agent",
                "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0)");
        HttpResponse response = client.execute(httpGet);
        HttpEntity entity = response.getEntity();
        code = response.getStatusLine().getStatusCode();
        if (code == 200 && entity != null) {
            in = entity.getContent();
            str = UpdateFile.updateFile(in, picid);
            logger.info("str=" + str);
        } else {
            // Non-200 or empty body: abort so the connection is not left half-read.
            httpGet.abort();
            return null;
        }
        long t2 = System.currentTimeMillis();
        long to = t2 - t1;
        logger.info("---HttpUtil---url:" + url + " , time:" + to
                + " ms , code:" + code);
    } catch (Exception e) {
        // Log with the throwable so the stack trace is preserved, not just
        // e.getMessage() (which may even be null).
        logger.error("------HttpUtil-----error--url:" + url + " , "
                + e.getMessage(), e);
        throw e;
    } finally {
        // Close quietly: an exception thrown out of a finally block would
        // replace (mask) any exception already propagating from the try block.
        if (in != null) {
            try {
                in.close();
            } catch (Exception ignored) {
                // best-effort close; nothing useful to do here
            }
        }
        httpGet.releaseConnection();
        client.getConnectionManager().shutdown();
    }
    return str;
}