Description
When crawling with Java and fetching a site's data directly with HttpClient, some sites respond with a login page instead of the content, JD being one example.
(Figure: the login response returned by the site.)
Opening the returned link shows JD's login page, which is not the HTML page we actually want.
Analysis
When we access the site with a bare HttpClient, the request carries none of the headers a browser would send, so the site assumes it is being attacked and refuses to serve the real page.
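To reproduce the failure, here is a minimal sketch of such a bare request (the demo class and target URL are illustrative assumptions); according to the behavior described above, a site like JD answers it with its login page rather than the page content:

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

public class BareRequestDemo {
    public static void main(String[] args) throws Exception {
        // No User-Agent is set here, so the site does not see a browser-like
        // request and serves its login page instead of the content we want.
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(new HttpGet("https://www.jd.com"))) {
            System.out.println(response.getStatusLine());
        }
    }
}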
Solution
We need to add the appropriate request headers, i.e. make the request look as if it came from a browser instead of from a blank, bot-like client.
Set a User-Agent header on the request to mimic a browser:
httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36");
The complete utility class, with this header applied in doGet, is shown below.
package jd.util;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Component;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.UUID;

@Component
public class HttpUtils {

    private PoolingHttpClientConnectionManager cm;

    public HttpUtils() {
        // Shared connection pool: at most 100 connections in total, 10 per host.
        this.cm = new PoolingHttpClientConnectionManager();
        this.cm.setMaxTotal(100);
        this.cm.setDefaultMaxPerRoute(10);
    }

    private RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectTimeout(1000)          // time to establish the connection (ms)
                .setConnectionRequestTimeout(500) // time to lease a connection from the pool (ms)
                .setSocketTimeout(10000)          // time to wait for response data (ms)
                .build();
    }

    // Download the page at the given URL and return its body as a string.
    public String doGet(String url) throws IOException {
        // Build an HttpClient backed by the shared connection pool.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.cm).build();

        // Create the GET request for the target URL.
        HttpGet httpGet = new HttpGet(url);
        // Mimic a browser; without this header the site treats the request
        // as an attack and serves its login page instead of the content.
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36");
        httpGet.setConfig(this.getConfig());

        // Execute the request; try-with-resources releases the response either way.
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                return EntityUtils.toString(response.getEntity(), "UTF-8");
            }
            return "";
        }
    }

    // Download an image and save it under a random file name; returns that name.
    public String doGetImage(String url) throws IOException {
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.cm).build();

        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(this.getConfig());

        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                // Keep the original extension, but use a random name to avoid collisions.
                String extName = url.substring(url.lastIndexOf("."));
                String picName = UUID.randomUUID().toString() + extName;
                // Note: the target directory is hard-coded here.
                try (OutputStream outputStream = new FileOutputStream(
                        new File("D:\\Intellij\\crawler\\src\\main\\resources\\imgs\\" + picName))) {
                    response.getEntity().writeTo(outputStream);
                }
                return picName;
            }
            return "";
        }
    }
}
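For reference, a minimal sketch of calling the utility directly (the demo class and URL are assumptions for illustration; in a Spring application the @Component would normally be injected rather than constructed by hand):

import jd.util.HttpUtils;

public class CrawlerDemo {
    public static void main(String[] args) throws Exception {
        HttpUtils httpUtils = new HttpUtils();
        // With the User-Agent header in place, doGet returns the page HTML
        // rather than a redirect to the login screen.
        String html = httpUtils.doGet("https://www.jd.com");
        System.out.println(html);
    }
}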
Output
The console now receives the crawled site's HTML text instead of the login link, and the text is ready for further processing.