/*
 * Java crawler utility: performs an HTTP GET on a given URL to fetch the site's
 * front-end HTML; the returned string is then parsed with Jsoup to extract the
 * required data. To avoid being blocked by the target site, the request headers
 * rotate through several browser User-Agent strings.
 *
 * Required Maven dependency:
 * <dependency>
 *     <groupId>org.apache.httpcomponents</groupId>
 *     <artifactId>httpclient</artifactId>
 *     <version>4.5.12</version>
 * </dependency>
 */
package com.example.demo.utils;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.UUID;
import java.util.concurrent.ThreadLocalRandom;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Component;
@Component
public class HttpUtils {

    /**
     * Pool of browser User-Agent strings. One entry is picked at random for
     * every request so the crawler does not present a single fixed browser
     * fingerprint (helps avoid being blocked by the target site). The first
     * entry is the User-Agent the original implementation always sent.
     */
    private static final String[] USER_AGENTS = {
            "Mozilla/5.0 (Windows NT 6.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Safari/605.1.15",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0"
    };

    // Directory downloaded images are written to. NOTE(review): hard-coded
    // Windows path — consider externalizing to configuration.
    private static final String IMAGE_DIR = "F:\\WzText\\crawler\\images\\";

    // Shared connection pool; all clients built in this class draw from it.
    private final PoolingHttpClientConnectionManager cm;

    public HttpUtils() {
        // Initialize the connection pool.
        this.cm = new PoolingHttpClientConnectionManager();
        // Maximum total connections across all routes.
        cm.setMaxTotal(100);
        // Maximum connections per individual host.
        cm.setDefaultMaxPerRoute(10);
    }

    /**
     * Downloads the HTML of the page at the given address.
     *
     * @param url page address to fetch
     * @return the page body decoded as UTF-8, or an empty string when the
     *         request fails, the status is not 200, or the response has no body
     */
    public String doGetHtml(String url) {
        // Build a client backed by the shared connection pool.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        HttpGet httpGet = new HttpGet(url);
        // Apply timeouts and a randomized browser User-Agent.
        httpGet.setConfig(getConfig());
        setHeaders(httpGet);
        // try-with-resources guarantees the response (and its pooled
        // connection) is released on every path.
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                return EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        // Failure path: empty string keeps callers null-safe.
        return "";
    }

    /**
     * Downloads the image at the given address into {@link #IMAGE_DIR} under a
     * random (UUID-based) file name that keeps the original extension.
     *
     * @param url image address to fetch
     * @return the generated image file name, or an empty string on failure
     */
    public String doGetImage(String url) {
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(getConfig());
        setHeaders(httpGet);
        // Original code leaked both the response and the output stream;
        // try-with-resources closes both on every path.
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                // Keep the original extension; a URL without a '.' previously
                // threw StringIndexOutOfBoundsException — fall back to no
                // extension instead.
                int dot = url.lastIndexOf('.');
                String extName = dot >= 0 ? url.substring(dot) : "";
                // Random name avoids collisions between downloads.
                String picName = UUID.randomUUID().toString() + extName;
                try (FileOutputStream fos = new FileOutputStream(new File(IMAGE_DIR + picName))) {
                    // Stream the entity straight to disk.
                    response.getEntity().writeTo(fos);
                }
                return picName;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        // Download failed: return an empty string.
        return "";
    }

    /**
     * Builds the per-request configuration: connect, connection-request and
     * socket timeouts.
     */
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectTimeout(1000)           // max time to establish the connection
                .setConnectionRequestTimeout(500)  // max time to obtain a connection from the pool
                .setSocketTimeout(10000)           // max time for data transfer
                .build();
    }

    /**
     * Sets anti-blocking request headers: a User-Agent chosen at random from
     * {@link #USER_AGENTS}, so successive requests appear to come from
     * different browsers.
     */
    private void setHeaders(HttpGet httpGet) {
        httpGet.setHeader("User-Agent",
                USER_AGENTS[ThreadLocalRandom.current().nextInt(USER_AGENTS.length)]);
    }
}