网络爬虫
网络爬虫(又被称为网页蜘蛛,网络机器人,在FOAF社区中间,更经常的称为网页追逐者),是一种按照一定的规则,自动的抓取万维网信息的程序或者脚本。另外一些不常使用的名字还有蚂蚁,自动索引,模拟程序或者蠕虫。
使用环境
采用环境为:
JDK1.8
IDEA和自带的Maven
导入pom.xml依赖
首先导入http访问
<dependencies>
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<!-- HttpClient -->
<!--http的访问-->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.slf4j/slf4j-log4j12 -->
<!--日志-->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.25</version>
<scope>test</scope>
</dependency>
</dependencies>
还需要在 resources 目录下新建一个文件,加入 slf4j 日志配置
在 resources 目录下创建 log4j.properties 文件, 并添加以下配置
# 在控制台显示日志
log4j.rootLogger=DEBUG,A1
log4j.logger.cn.itcast = DEBUG
log4j.appender.A1=org.apache.log4j.ConsoleAppender
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c]-[%p] %m%n
创建简单的爬虫程序爬取HTML代码
在java目录下创建目录,创建程序命名
package cn.itcast.crawler.test;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

/**
 * Minimal crawler example: fetches a single page with HttpClient and prints its HTML.
 */
public class CrawlerFirst {
    public static void main(String[] args) throws Exception {
        // 1. "Open the browser": create the HttpClient object
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // 2. "Type the address": create an HttpGet request for the target URL
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");
        // 3. "Press enter": execute the request and obtain the response.
        //    try-with-resources guarantees the response is closed even on failure.
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // 4. Parse the response only if the server answered 200 OK
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity httpEntity = response.getEntity();
                String content = EntityUtils.toString(httpEntity, "utf8");
                System.out.println(content);
            }
        } finally {
            // Release the client's connection resources
            httpClient.close();
        }
    }
}
运用Get来获取
这一步需要在 pom.xml 中注释掉 <scope>test</scope>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.25</version>
<!--<scope>test</scope>-->
</dependency>
package cn.itcast.crawler.test;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * Plain GET example: fetches a page and prints the length of its HTML.
 */
public class HttpGetTest {
    public static void main(String[] args) {
        // Create the HttpClient object
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // Create the HttpGet request with the target URL
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");
        // Execute the request with HttpClient and obtain the response
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close resources. Guard against NPE: response stays null
            // if execute() threw before the assignment completed.
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
运用Get参数来获取
package cn.itcast.crawler.test;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * GET-with-parameters example: builds the query string with URIBuilder
 * instead of concatenating it by hand, then prints the fetched HTML.
 */
public class HttpGetTest {
    public static void main(String[] args) throws Exception {
        // Create the HttpClient object
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // Target request: http://yun.itheima.com/search?keys=Java
        // URIBuilder takes care of encoding the query parameters
        URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search");
        uriBuilder.setParameter("keys", "Java");
        // Create the HttpGet request from the built URI
        HttpGet httpGet = new HttpGet(uriBuilder.build());
        // Execute the request and obtain the response
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                // Print the result so the example actually shows its output
                System.out.println(content);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Guard against NPE: response stays null if execute() threw
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
运用Post来获取
只需要把 HttpGet 对象改为 HttpPost 对象就可以了
package cn.itcast.crawler.test;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * POST example: identical to the GET example except the request object
 * is an HttpPost. Note the import must be HttpPost, not HttpGet,
 * otherwise the listing does not compile.
 */
public class HttpGetTest {
    public static void main(String[] args) {
        // Create the HttpClient object
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // Create the HttpPost request with the target URL
        HttpPost httpPost = new HttpPost("http://www.itcast.cn");
        // Execute the request and obtain the response
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpPost);
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Guard against NPE: response stays null if execute() threw
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
运用Post请求带参数
package cn.itcast.crawler.test;

import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * POST-with-form-parameters example: parameters are sent in the request
 * body as a URL-encoded form entity rather than on the query string.
 */
public class HttpPostTest {
    public static void main(String[] args) throws Exception {
        // Create the HttpClient object
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // Target request: http://yun.itheima.com/search with form data keys=Java
        HttpPost httpPost = new HttpPost("http://yun.itheima.com/search");
        // Declare a List holding the form parameters
        List<NameValuePair> params = new ArrayList<NameValuePair>();
        params.add(new BasicNameValuePair("keys", "Java"));
        // Wrap the parameters in a form entity:
        // first argument is the parameter list, second is the encoding
        UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "utf8");
        // Attach the form entity to the POST request
        httpPost.setEntity(formEntity);
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpPost);
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                // Print the result so the example actually shows its output
                System.out.println(content);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Guard against NPE: response stays null if execute() threw
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
使用连接池来调取HttpClient
package cn.itcast.crawler.test;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * Connection-pool example: HttpClient instances share a
 * PoolingHttpClientConnectionManager so connections are reused
 * instead of being opened and torn down for every request.
 */
public class HttpClientPool {
    public static void main(String[] args) {
        // Create the connection pool manager
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        // Maximum number of connections in the whole pool
        cm.setMaxTotal(100);
        // Maximum number of connections per host (route)
        cm.setDefaultMaxPerRoute(10);
        // Issue a request through the pool manager
        doGet(cm);
    }

    /**
     * Performs one GET request using a client backed by the shared pool.
     *
     * @param cm the shared connection pool manager
     */
    private static void doGet(PoolingHttpClientConnectionManager cm) {
        // Each call builds a client on top of the pool; connections
        // come from the pool instead of being created from scratch.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            // Do NOT close httpClient here: the connection manager owns
            // the connections and closing the client would shut down the pool.
            // httpClient.close();
        }
    }
}
请求参数的设置
就是在请求中设置一些访问的格式和时间
package cn.itcast.crawler.test;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * Request-configuration example: sets connect, connection-request and
 * socket timeouts on a single request via RequestConfig.
 */
public class HttpConfigTest {
    public static void main(String[] args) {
        // Create the HttpClient object
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // Create the HttpGet request with the target URL
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");
        // Configure the request timeouts (all in milliseconds)
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(1000)          // max time to establish the connection
                .setConnectionRequestTimeout(500) // max time to obtain a connection from the pool
                .setSocketTimeout(10 * 1000)      // max time for data transfer
                .build();
        // Attach the configuration to this request
        httpGet.setConfig(config);
        // Execute the request and obtain the response
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Guard against NPE: response stays null if execute() threw
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
视频讲解
视频链接,共同学习进步