Java Web Crawler
Preface
Note: this is a record of my own Java web crawler learning, following along with a video course.
一、What is a web crawler?
A web crawler (also called a web robot) is a program or script that automatically collects information from the Internet according to certain rules. A crawler generally consists of three parts: data collection, processing, and storage. Starting from the URLs of a set of initial pages, it fetches those pages and keeps adding the URLs found on them to a queue, until some stop condition of the system is met.
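To make that queue-driven loop concrete, here is a minimal sketch of my own (not from the course) using only the JDK's built-in java.net.http client; the seed URL, the ten-page stop condition, and the naive href regex are all assumptions for illustration:

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.util.ArrayDeque;
import java.util.HashSet;
import java.util.Queue;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class CrawlLoopSketch {
    // naive absolute-link extractor, for illustration only
    private static final Pattern LINK = Pattern.compile("href=\"(https?://[^\"]+)\"");

    public static void main(String[] args) {
        HttpClient client = HttpClient.newHttpClient();
        Queue<String> queue = new ArrayDeque<>(); // URLs waiting to be fetched
        Set<String> seen = new HashSet<>();       // already-fetched URLs, to avoid revisits
        queue.add("https://www.baidu.com/");      // seed URL (assumption)
        int limit = 10;                           // stop condition: fetch at most 10 pages

        while (!queue.isEmpty() && seen.size() < limit) {
            String url = queue.poll();
            if (!seen.add(url)) continue;         // skip URLs we have already fetched
            try {
                HttpResponse<String> resp = client.send(
                        HttpRequest.newBuilder(URI.create(url)).build(),
                        HttpResponse.BodyHandlers.ofString());
                // collect: pull new URLs out of the page and enqueue them
                Matcher m = LINK.matcher(resp.body());
                while (m.find()) queue.add(m.group(1));
                System.out.println("fetched " + url + " (" + resp.body().length() + " chars)");
            } catch (Exception e) {
                // a malformed or unreachable URL should not stop the whole crawl
                System.err.println("skipped " + url + ": " + e.getMessage());
            }
        }
    }
}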
二、Usage
1. Configure the environment
Add the HttpClient and logging dependencies to pom.xml:
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.slf4j/slf4j-log4j12 -->
<!-- logging -->
<dependency>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-log4j12</artifactId>
    <version>1.7.25</version>
    <!-- <scope>test</scope> -->
</dependency>
2. Create the slf4j log configuration file
Create a log4j.properties file under the resources directory:
# A1 prints log output to the console
log4j.rootLogger=DEBUG,A1
log4j.logger.cn.ayulong=DEBUG
log4j.appender.A1=org.apache.log4j.ConsoleAppender
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%-d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c]-[%p] %m%n
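With the dependencies and log4j.properties in place, a quick sanity check (my own snippet, not from the course) is to obtain a logger through the slf4j facade and confirm the DEBUG line appears on the console:

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class LogTest {
    private static final Logger logger = LoggerFactory.getLogger(LogTest.class);

    public static void main(String[] args) {
        // should appear on the console, since the root logger is at DEBUG
        logger.debug("log4j.properties was picked up");
    }
}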
三、HttpClient
Use HttpClient, the Java HTTP protocol client library, to fetch web page data.
1. GET request
package cn.ayulong.crawler.test;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

public class HttpGetTest {

    public static void main(String[] args) {
        // Create the HttpClient object
        CloseableHttpClient httpClient = HttpClients.createDefault();

        // Create the HttpGet object and set the URL to request
        HttpGet httpGet = new HttpGet("https://www.baidu.com/");

        CloseableHttpResponse response = null;
        try {
            // Send the request with HttpClient and obtain the response
            response = httpClient.execute(httpGet);

            // Parse the response
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "UTF-8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close the response and the client; guard against a failed
            // request having left response null
            try {
                if (response != null) {
                    response.close();
                }
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
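Since CloseableHttpClient and CloseableHttpResponse both implement Closeable, the same request can also be written more compactly with try-with-resources. This is an equivalent sketch of my own, not the course's version:

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class HttpGetTryWithResources {
    public static void main(String[] args) throws Exception {
        HttpGet httpGet = new HttpGet("https://www.baidu.com/");
        // both resources are closed automatically, in reverse order
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "UTF-8");
                System.out.println(content.length());
            }
        }
    }
}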
2. GET request with parameters
package cn.ayulong.crawler.test;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

public class HttpGetParamTest {

    public static void main(String[] args) throws Exception {
        // Create the HttpClient object
        CloseableHttpClient httpClient = HttpClients.createDefault();

        // Build the request URL with URIBuilder;
        // the resulting address is https://www.baidu.com/?keys=Java
        URIBuilder uriBuilder = new URIBuilder("https://www.baidu.com/");

        // Set the query parameter
        uriBuilder.setParameter("keys", "Java");

        // Create the HttpGet object and set the URL to request
        HttpGet httpGet = new HttpGet(uriBuilder.build());
        System.out.println("Request info: " + httpGet);
        CloseableHttpResponse response = null;
        try {
            // Send the request with HttpClient and obtain the response
            response = httpClient.execute(httpGet);

            // Parse the response
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "UTF-8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close the response and the client
            try {
                if (response != null) {
                    response.close();
                }
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
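URIBuilder's setParameter returns the builder itself, so several parameters can be chained before build(). A small sketch; the "wd" and "pn" parameter names here are illustrative assumptions, not taken from the course:

import java.net.URI;

import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;

public class UriBuilderChainTest {
    public static void main(String[] args) throws Exception {
        // setParameter returns the builder, so calls can be chained
        URI uri = new URIBuilder("https://www.baidu.com/s")
                .setParameter("wd", "Java")   // illustrative query parameter
                .setParameter("pn", "10")     // illustrative page-offset parameter
                .build();
        HttpGet httpGet = new HttpGet(uri);
        System.out.println(httpGet); // GET https://www.baidu.com/s?wd=Java&pn=10 HTTP/1.1
    }
}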