目录
2.1 HttpClient--Get 修改成连接池(直接看这个)
准备工作
导入依赖
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.2</version>
</dependency>
yml
logging:
  level:
    root: info
    com.lrm: debug
1.入门程序(获取到静态页面)
package com.itheima.reggie.utils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * Introductory crawler example: downloads a static page with a default
 * HttpClient and prints its HTML.
 *
 * @Author lpc
 **/
public class CrawlerFirst {
    /**
     * Sends a GET request to https://www.itcast.cn/ and prints the response
     * body when the status code is 200.
     *
     * @param args unused
     * @throws Exception if executing the request, reading the body or closing
     *                   a resource fails
     */
    public static void main(String[] args) throws Exception {
        // "Type the address": build the GET request.
        HttpGet httpGet = new HttpGet("https://www.itcast.cn/");
        // "Open the browser" and "press enter": create the client and execute the
        // request. try-with-resources closes the response first and then the
        // client, even when an exception is thrown; the original never closed either.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // Only parse the body when the status code is 200.
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity httpEntity = response.getEntity();
                // Decode the static page as UTF-8 text.
                String content = EntityUtils.toString(httpEntity, "utf8");
                System.out.println(content);
            }
        }
    }
}
2.HttpClient---Get
package com.itheima.reggie.utils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * GET example: downloads a page and prints the length of its HTML, releasing
 * the response and the client afterwards.
 *
 * @Author lpc
 * @Date 2024 03 12 00 23
 **/
public class CrawlerFirst {
    /**
     * Sends a GET request to https://www.itcast.cn/ and prints the length of
     * the body when the status code is 200.
     *
     * @param args unused
     * @throws RuntimeException wrapping any {@link IOException} raised while
     *                          executing the request, reading the body or
     *                          closing a resource
     */
    public static void main(String[] args) {
        // "Type the address": build the GET request.
        HttpGet httpGet = new HttpGet("https://www.itcast.cn/");
        // try-with-resources replaces the manual finally block. The old code
        // called response.close() even when execute() had failed and response
        // was still null, which raised a NullPointerException that hid the real
        // error; a failing response.close() also skipped httpClient.close().
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // Only parse the body when the status code is 200.
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity httpEntity = response.getEntity();
                // Decode the static page as UTF-8 text.
                String content = EntityUtils.toString(httpEntity, "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}
2.1 HttpClient--Get 修改成连接池(直接看这个)
如果每次请求都要创建HttpClient,会有频繁创建和销毁的问题,可以使用连接池来解决这个问题。
测试以下代码,并通过断点查看:每次获取的HttpClient对象都不一样,但它们共用同一个连接池。
package org.example;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * GET example that shares one pooling connection manager between requests.
 *
 * @Author lpc
 * @Date 2024 03 14 09 38
 **/
public class Test {
    /**
     * Creates the connection pool, performs one GET through it and shuts the
     * pool down.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Create the connection pool.
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        try {
            // Maximum number of connections in the whole pool.
            cm.setMaxTotal(100);
            // Maximum number of connections per host (route).
            cm.setDefaultMaxPerRoute(10);
            // Issue the request through the pool.
            doGet(cm);
        } finally {
            // Release the pooled connections once the pool is no longer needed;
            // the original never shut the pool down.
            cm.shutdown();
        }
    }

    /**
     * Sends a GET request to http://www.itcast.cn using connections taken from
     * the given pool and prints the length of the body when the status is 200.
     *
     * @param cm connection manager that supplies the connections
     * @throws RuntimeException wrapping any {@link IOException} raised while
     *                          executing the request, reading the body or
     *                          closing the response
     */
    public static void doGet(PoolingHttpClientConnectionManager cm) {
        // A client object is built on every call, but it is configured with the
        // shared connection manager. It is intentionally not closed here: the
        // connections are managed by the pool.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");
        // try-with-resources closes the response on every path.
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}
3.HttpClient---Get带参数
package org.example;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.net.URISyntaxException;
/**
 * GET example with query parameters built through {@code URIBuilder}.
 *
 * @Author lpc
 * @Date 2024 03 13 20 44
 **/
public class Test2 {
    /**
     * Requests http://yun.itheima.com/search?keys=Java and prints the body
     * when the status code is 200.
     *
     * @param args unused
     * @throws Exception if the URI cannot be built
     * @throws RuntimeException wrapping any {@link IOException} raised while
     *                          executing the request, reading the body or
     *                          closing a resource
     */
    public static void main(String[] args) throws Exception {
        // Target address: http://yun.itheima.com/search?keys=Java
        // Build the base URI, then add the query parameters (several may be set).
        URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search");
        uriBuilder.setParameter("keys", "Java");
        // "Type the address": build the GET request from the finished URI.
        HttpGet httpGet = new HttpGet(uriBuilder.build());
        System.out.println("发起请求的信息"+httpGet);
        // The client is opened only after the request is built, so a URI failure
        // cannot leak it. try-with-resources also replaces the old finally block,
        // which called response.close() on a null response when execute() threw.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity httpEntity = response.getEntity();
                String s = EntityUtils.toString(httpEntity, "utf8");
                System.out.println(s);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}
3.1 修改成连接池
4.HttpClient---Post
package org.example;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * POST example without parameters.
 *
 * @Author lpc
 * @Date 2024 03 13 20 59
 **/
public class Post {
    /**
     * Sends a POST request to https://www.itcast.cn/ and prints the body when
     * the status code is 200.
     *
     * @param args unused
     * @throws RuntimeException wrapping any {@link IOException} raised while
     *                          executing the request, reading the body or
     *                          closing a resource
     */
    public static void main(String[] args) {
        // "Type the address": build the POST request.
        HttpPost httpPost = new HttpPost("https://www.itcast.cn/");
        // try-with-resources closes the response and then the client on every
        // path. The old finally block called response.close() on a null
        // response whenever execute() threw.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpPost)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity httpEntity = response.getEntity();
                String s = EntityUtils.toString(httpEntity, "utf8");
                System.out.println(s);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}
4.1 修改成连接池
package org.example;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * POST example that shares one pooling connection manager between requests.
 *
 * @Author lpc
 * @Date 2024 03 14 10 02
 **/
public class Postl {
    /**
     * Creates the connection pool, performs one POST through it and shuts the
     * pool down.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Create the connection pool manager.
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        try {
            // Maximum number of connections in the whole pool.
            cm.setMaxTotal(100);
            // Maximum number of connections per host (route).
            cm.setDefaultMaxPerRoute(10);
            // Issue the request through the pool.
            doPost(cm);
        } finally {
            // Release the pooled connections once the pool is no longer needed;
            // the original never shut the pool down.
            cm.shutdown();
        }
    }

    /**
     * Sends a POST request to http://yun.itheima.com/search using connections
     * taken from the given pool and prints the length of the body when the
     * status code is 200.
     *
     * @param cm connection manager that supplies the connections
     * @throws RuntimeException wrapping any {@link IOException} raised while
     *                          executing the request, reading the body or
     *                          closing the response
     */
    private static void doPost(PoolingHttpClientConnectionManager cm) {
        // A client object is built on every call, but it is configured with the
        // shared connection manager. It is intentionally not closed here: the
        // connections are managed by the pool.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        // "Type the address": build the POST request.
        HttpPost httpPost = new HttpPost("http://yun.itheima.com/search");
        // try-with-resources closes the response on every path.
        try (CloseableHttpResponse response = httpClient.execute(httpPost)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                // Pass the charset explicitly, as the other examples do, so the
                // text is decoded as UTF-8 when the response declares no charset.
                String s = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(s.length());
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}
5.HttpClient---Post带参数
package org.example;
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
/**
 * POST example that submits form parameters.
 *
 * @Author lpc
 * @Date 2024 03 13 20 59
 **/
public class Post {
    /**
     * Posts the form field {@code keys=Java} to http://yun.itheima.com/search
     * and prints the body when the status code is 200.
     *
     * @param args unused
     * @throws Exception if the form entity cannot be encoded
     * @throws RuntimeException wrapping any {@link IOException} raised while
     *                          executing the request, reading the body or
     *                          closing a resource
     */
    public static void main(String[] args) throws Exception {
        // "Type the address": build the POST request.
        HttpPost httpPost = new HttpPost("http://yun.itheima.com/search");
        // Collect the form parameters; this is the POST counterpart of
        // http://yun.itheima.com/search?keys=Java
        List<NameValuePair> params = new ArrayList<>();
        params.add(new BasicNameValuePair("keys", "Java"));
        // Build the form entity: first argument is the form data, second the encoding.
        UrlEncodedFormEntity urlEncodedFormEntity = new UrlEncodedFormEntity(params, "utf8");
        // Attach the form entity to the POST request.
        httpPost.setEntity(urlEncodedFormEntity);
        // The client is opened only after the request is ready, so an encoding
        // failure cannot leak it. try-with-resources also replaces the old finally
        // block, which called response.close() on a null response when execute() threw.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpPost)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity httpEntity = response.getEntity();
                String s = EntityUtils.toString(httpEntity, "utf8");
                System.out.println(s);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}
6.HttpClient-连接池
如果每次请求都要创建HttpClient,会有频繁创建和销毁的问题,可以使用连接池来解决这个问题。
测试以下代码,并通过断点查看:每次获取的HttpClient对象都不一样,但它们共用同一个连接池。
package org.example;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * Connection-pool example: one pooling connection manager supplies the
 * connections used by each request.
 *
 * @Author lpc
 * @Date 2024 03 14 09 38
 **/
public class Test {
    /**
     * Creates the connection pool, performs one GET through it and shuts the
     * pool down.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Create the connection pool.
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        try {
            // Maximum number of connections in the whole pool.
            cm.setMaxTotal(100);
            // Maximum number of connections per host (route).
            cm.setDefaultMaxPerRoute(10);
            // Issue the request through the pool.
            doGet(cm);
        } finally {
            // Release the pooled connections once the pool is no longer needed;
            // the original never shut the pool down.
            cm.shutdown();
        }
    }

    /**
     * Sends a GET request to http://www.itcast.cn using connections taken from
     * the given pool and prints the length of the body when the status is 200.
     *
     * @param cm connection manager that supplies the connections
     * @throws RuntimeException wrapping any {@link IOException} raised while
     *                          executing the request, reading the body or
     *                          closing the response
     */
    public static void doGet(PoolingHttpClientConnectionManager cm) {
        // A client object is built on every call, but it is configured with the
        // shared connection manager. It is intentionally not closed here: the
        // connections are managed by the pool.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");
        // try-with-resources closes the response on every path.
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}
7.设置请求信息
package org.example;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * Pooled GET example that also configures per-request timeouts.
 *
 * @Author lpc
 * @Date 2024 03 14 09 38
 **/
public class Test {
    /**
     * Creates the connection pool, performs one GET through it and shuts the
     * pool down.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Create the connection pool.
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        try {
            // Maximum number of connections in the whole pool.
            cm.setMaxTotal(100);
            // Maximum number of connections per host (route).
            cm.setDefaultMaxPerRoute(10);
            // Issue the request through the pool.
            doGet(cm);
        } finally {
            // Release the pooled connections once the pool is no longer needed;
            // the original never shut the pool down.
            cm.shutdown();
        }
    }

    /**
     * Sends a GET request with explicit timeouts to http://www.itcast.cn using
     * connections taken from the given pool, and prints the length of the body
     * when the status code is 200.
     *
     * @param cm connection manager that supplies the connections
     * @throws RuntimeException wrapping any {@link IOException} raised while
     *                          executing the request, reading the body or
     *                          closing the response
     */
    public static void doGet(PoolingHttpClientConnectionManager cm) {
        // A client object is built on every call, but it is configured with the
        // shared connection manager. It is intentionally not closed here: the
        // connections are managed by the pool.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");
        // Configure the request timeouts.
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(1000)          // longest time to establish a connection, in milliseconds
                .setConnectionRequestTimeout(500) // longest time to obtain a connection
                .setSocketTimeout(10 * 1000)      // longest time for data transfer
                .build();
        // Apply the configuration to this request.
        httpGet.setConfig(config);
        // try-with-resources closes the response on every path.
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}
8.jsoup介绍
jsoup是一款Java的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM、CSS以及类似于jQuery的操作方法来取出和操作数据。
jsoup 的主要功能如下:
1.从一个 URL,文件或字符串中解析HTML;
2.使用DOM或CSS选择器来查找、取出数据;
3.可操作HTML元素、属性、文本;
依赖
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.4</version>
</dependency>
<!-- lang3 -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.8.1</version>
</dependency>
9.jsoup解析url
package jsoup;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.junit.Test;
import java.net.MalformedURLException;
import java.net.URL;
/**
 * jsoup example: parse a page straight from a URL.
 *
 * @Author lpc
 * @Date 2024 03 14 10 44
 **/
public class jsoupTestFirst {
    /**
     * Parses http://www.itcast.cn and prints the text of its first
     * {@code title} element.
     *
     * @throws Exception if the URL is malformed or the page cannot be fetched
     */
    @Test
    public void testJsoupUrl() throws Exception {
        // Address to parse and the timeout value passed to jsoup.
        final URL pageUrl = new URL("http://www.itcast.cn");
        final int timeout = 10 * 1000;
        Document page = Jsoup.parse(pageUrl, timeout);
        // Read the content of the first title element.
        String titleText = page.getElementsByTag("title").first().text();
        System.out.println(titleText);
    }
}
10.jsoup解析字符串
package jsoup;
import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.junit.Test;
import java.io.File;
import java.net.MalformedURLException;
import java.net.URL;
/**
 * jsoup example: parse HTML held in a string.
 *
 * @Author lpc
 * @Date 2024 03 14 10 44
 **/
public class jsoupTestFirst {
    /**
     * Reads D:\file.html into a string, parses that string and prints the
     * text of its first {@code title} element.
     *
     * @throws Exception if the file cannot be read
     */
    @Test
    public void testString() throws Exception {
        // Load the whole file as UTF-8 text with the commons-io helper.
        File htmlFile = new File("D:\\file.html");
        String html = FileUtils.readFileToString(htmlFile, "utf8");
        // Parse the string and print the content of the first title element.
        System.out.println(Jsoup.parse(html).getElementsByTag("title").first().text());
    }
}
11.jsoup解析文件
/**
 * Parses D:\file.html directly from disk and prints the text of its first
 * {@code title} element.
 *
 * @throws Exception if the file cannot be read
 */
@Test
public void testFile() throws Exception {
    // Let jsoup read and parse the file itself, decoding it as UTF-8.
    File source = new File("D:\\file.html");
    Document document = Jsoup.parse(source, "utf8");
    System.out.println(document.getElementsByTag("title").first().text());
}
12.所有dom方式获取元素
/**
 * Shows the DOM-style lookups jsoup offers and prints the combined text of
 * every element that carries an {@code abc} attribute.
 *
 * <p>Other lookups of the same family, not exercised here:
 * <ol>
 *   <li>by id: {@code getElementById("popupMenu")}</li>
 *   <li>by tag: {@code getElementsByTag("span").first()}</li>
 *   <li>by class: {@code getElementsByClass("city_nav")}</li>
 *   <li>by attribute: {@code getElementsByAttribute("abc")} (used below)</li>
 * </ol>
 *
 * @throws Exception if the file cannot be read
 */
@Test
public void testDom() throws Exception {
    // Parse the file to obtain the Document.
    File source = new File("D:\\file.html");
    Document document = Jsoup.parse(source, "utf8");
    // Lookup 4: all elements that have the attribute "abc".
    String text = document.getElementsByAttribute("abc").text();
    System.out.println(text);
}
13.元素中获取数据
/**
 * Looks up the element with id {@code test} in D:\file.html and prints, in
 * order: the element itself, its id, its class name, its {@code id}
 * attribute value, all of its attributes and its text content.
 *
 * @throws Exception if the file cannot be read
 */
@Test
public void testData() throws Exception {
    // Parse the file.
    Document document = Jsoup.parse(new File("D:\\file.html"), "utf8");
    // Fetch the element by id.
    Element target = document.getElementById("test");
    System.out.println(target);
    // 1. id of the element
    System.out.println(target.id());
    // 2. className of the element
    System.out.println(target.className());
    // 3. value of a single attribute
    System.out.println(target.attr("id"));
    // 4. all attributes of the element
    System.out.println(target.attributes());
    // 5. text content of the element
    System.out.println(target.text());
}
14.选择器方式获取元素
/**
 * Shows the basic selector forms accepted by {@code select} and prints every
 * element matched by {@code [class=s_name]}.
 *
 * <p>Basic selector forms:
 * <ul>
 *   <li>{@code tagname} - by tag, e.g. {@code span}</li>
 *   <li>{@code #id} - by id, e.g. {@code #city_bj}</li>
 *   <li>{@code .class} - by class name, e.g. {@code .class_a}</li>
 *   <li>{@code [attribute]} - by attribute presence, e.g. {@code [abc]}</li>
 *   <li>{@code [attr=value]} - by attribute value, e.g. {@code [class=s_name]}</li>
 * </ul>
 *
 * @throws Exception if the file cannot be read
 */
@Test
public void testSeletor() throws Exception {
    // Parse the file.
    File source = new File("D:\\file.html");
    Document document = Jsoup.parse(source, "utf8");
    // [attr=value]: print each element whose class attribute equals s_name.
    document.select("[class=s_name]").forEach(match -> System.out.println(match));
}
15.Selector选择器组合使用
/**
 * Shows how selector forms can be combined in {@code select}.
 *
 * <p>Prints the first {@code span[abc]} match, the first
 * {@code span[abc].s_name} match and every {@code li} found under
 * {@code .city_con}; the remaining lookups are run only as syntax examples.
 *
 * @throws Exception if the file cannot be read
 */
@Test
public void testSeletor2() throws Exception {
    // Parse the file.
    Document parse = Jsoup.parse(new File("D:\\file.html"), "utf8");
    // el#id: element + id, e.g. h3#city_bj. The '#' is required: the previous
    // selector "h3city_id" looked for a tag literally named h3city_id.
    // NOTE(review): the id city_id is kept from the original literal - confirm
    // it matches an id that exists in file.html.
    Element element = parse.select("h3#city_id").first();
    // el.class: element + class, e.g. li.class_a
    Element element1 = parse.select("li.class_a").first();
    // el[attr]: element + attribute name, e.g. span[abc]
    Element first = parse.select("span[abc]").first();
    System.out.println(first);
    // Any combination, e.g. span[abc].s_name
    Element first2 = parse.select("span[abc].s_name").first();
    System.out.println(first2);
    // ancestor child: descendants of an element, e.g. ".city_con li" finds
    // every li under "city_con".
    parse.select(".city_con li").forEach(System.out::println);
    // parent > child: direct children of a parent, e.g. ".city_con > ul > li"
    // finds the first-level ul under city_con and then the first-level li
    // under each of those ul elements.
    Elements select = parse.select(".city_con > ul > li");
    // parent > *: all direct children of a parent.
    Elements select1 = parse.select(".city_con > ul > *");
}
16.