新建maven项目
在pom.xml中添加坐标
首先在maven存储库搜索用来抓数据的HttpClient的坐标
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.2</version>
</dependency>
还有日志依赖:SLF4J 的 Log4j 绑定(slf4j-log4j12),用来输出日志
<!-- https://mvnrepository.com/artifact/org.slf4j/slf4j-log4j12 -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.25</version>
<!-- The test scope below is left commented out so the dependency keeps Maven's default (compile) scope. -->
<!--<scope>test</scope>-->
</dependency>
将这两个坐标添加到pom.xml的<dependencies></dependencies>标签中
然后在main > resources中新建log4j.properties
并在这个文件中写入
# Root logger: DEBUG level, sent to the appender named A1.
log4j.rootLogger=DEBUG,A1
# Logger for this project's own package (the crawler class lives in cn.stuspider).
log4j.logger.cn.stuspider = DEBUG
# A1 writes to the console using a pattern layout.
log4j.appender.A1=org.apache.log4j.ConsoleAppender
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
# Format: timestamp [thread] [logger]-[level] message, followed by a line break.
log4j.appender.A1.layout.ConversionPattern=%-d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c]-[%p] %m%n
爬取一个网站源码
在main > java下新建一个包cn.stuspider
新建一个CrawlerFirst类
打开浏览器 – 创建HttpClient对象
CloseableHttpClient httpClient = HttpClients.createDefault();
输入网址 – 使用HttpGet对象请求
请求方法为GET, POST以后再说
HttpGet httpGet = new HttpGet("https://csdn.net");
按回车,发起请求,返回响应 – 使用HttpClient发起请求
用response接收响应
CloseableHttpResponse response = httpClient.execute(httpGet);
解析响应,获取数据
首先要判断状态码是否和浏览器上的一致(正常是200),
可以通过 response.getStatusLine().getStatusCode() 获取状态码
用 httpEntity 接收响应体:response.getEntity()
再用 EntityUtils.toString(httpEntity, "utf8") 把响应体转换成字符串,
得到网页Response里的网页源码
// Only read the body when the server answered with status 200.
if (response.getStatusLine().getStatusCode() == 200) {
// The entity holds the response body.
HttpEntity httpEntity = response.getEntity();
// Convert the body to a String using the "utf8" charset, then print it.
String content = EntityUtils.toString(httpEntity, "utf8");
System.out.println(content);
}
最后贴上我的代码
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the stuspider-first crawler example. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Coordinates of this project -->
<groupId>cn.stuspider</groupId>
<artifactId>stuspider-first</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<!-- Apache HttpClient: used by CrawlerFirst to send the GET request -->
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.2</version>
</dependency>
<!-- SLF4J binding for Log4j; logging is configured in log4j.properties -->
<!-- https://mvnrepository.com/artifact/org.slf4j/slf4j-log4j12 -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.25</version>
<!-- Test scope is left disabled so the dependency keeps Maven's default (compile) scope. -->
<!-- <scope>test</scope>-->
</dependency>
</dependencies>
</project>
CrawlerFirst.java
package cn.stuspider;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * Minimal HttpClient example: sends one GET request and prints the page source.
 */
public class CrawlerFirst {

    /**
     * Requests {@code https://csdn.net} and prints the response body to standard output when the
     * status code is 200; any other status is reported on standard error.
     *
     * @param args command-line arguments (unused)
     * @throws IOException if executing the request, reading the body, or closing the
     *     client/response fails
     */
    public static void main(String[] args) throws IOException {
        // 2. "Type the URL": the request method is GET, so an HttpGet object is used.
        HttpGet httpGet = new HttpGet("https://csdn.net");

        // 1. "Open the browser": create the HttpClient object.
        // 3. "Press Enter": execute the request; response receives the server's answer.
        // Both objects are Closeable; try-with-resources closes the response first and then the
        // client, even when reading the body throws, so the connection is not leaked.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
                CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // 4. Parse the response: check the status code first (200 is the normal case).
            if (response.getStatusLine().getStatusCode() == 200) {
                // The entity holds the response body; convert it to a String to get the source.
                HttpEntity httpEntity = response.getEntity();
                String content = EntityUtils.toString(httpEntity, "utf8");
                System.out.println(content);
            } else {
                // Report the failure instead of exiting silently with no output.
                System.err.println("Unexpected response: " + response.getStatusLine());
            }
        }
    }
}