HttpClient和jsoup基础API使用

准备配置

pom

    <dependencies>
        <!-- Apache HttpClient 4.x: HTTP client library used by the HttpClient examples -->
        <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.2</version>
        </dependency>
        <!-- SLF4J binding for log4j 1.2; configured through log4j.properties -->
        <!-- https://mvnrepository.com/artifact/org.slf4j/slf4j-log4j12 -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.24</version>
        </dependency>
        <!-- JUnit 4: supplies the @Test annotation used by the jsoup examples -->
        <!-- NOTE(review): no <scope> is declared, so Maven's default (compile) scope applies; confirm whether test scope was intended -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.10</version>
        </dependency>
        <!-- jsoup: HTML fetching and parsing library used by the Jsoup examples -->
        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.13.1</version>
        </dependency>
        <!-- Commons IO: presumably the source of FileUtils used to read the local HTML file; verify against the test class imports -->
        <!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.3</version>
        </dependency>
        <!-- Commons Lang 3: not referenced by any snippet shown in this article; TODO confirm it is still needed -->
        <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.4</version>
        </dependency>

    </dependencies>

log4j.properties

# Root logger: DEBUG level, events are sent to the appender named A1
log4j.rootLogger=DEBUG,A1
# Logger for the com.hikktn package, also at DEBUG level
log4j.logger.com.hikktn = DEBUG

# A1 writes to the console and formats each event with a pattern layout
log4j.appender.A1=org.apache.log4j.ConsoleAppender
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
# Pattern: date/time, [thread], [logger]-[level], message, line separator
log4j.appender.A1.layout.ConversionPattern=%-d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c]-[%p] %m%n

准备SpringDataJpa

需要先准备一个简单的 Spring Data JPA 项目,用于进行下面的测试。

https://blog.csdn.net/qq_41520636/article/details/115472103

启动相应的环境。

完成springdatajpa入门程序后,需要简单更改一下。

在UserController.java中增加或修改接口

    /**
	 * Looks up a single user by id.
	 *
	 * @param userId value of the {@code userId} request parameter
	 * @return the stored user, or a freshly constructed {@code UserEntity} when no row has that id
	 */
	@GetMapping("/getOne")
	public UserEntity getUserInfo(@RequestParam("userId") int userId) {
		// @RequestParam binds the query parameter explicitly, matching the other handlers in this controller;
		// the previous @Param annotation is not a Spring MVC binding annotation.
		return userRepository.findById(userId).orElseGet(UserEntity::new);
	}

	/**
	 * Returns one page of users.
	 *
	 * @param pageNum  page number taken from the {@code pageNum} request parameter (defaults to 1)
	 * @param pageSize page size taken from the {@code pageSize} request parameter (defaults to 10)
	 * @return the page produced by the repository for that page request
	 */
	@GetMapping("/list")
	public Page<UserEntity> pageQuery(@RequestParam(value = "pageNum", defaultValue = "1") Integer pageNum,
	                                  @RequestParam(value = "pageSize", defaultValue = "10") Integer pageSize) {
		// The request parameter defaults to 1, and 1 is subtracted before the page request is built.
		int pageIndex = pageNum - 1;
		PageRequest pageRequest = PageRequest.of(pageIndex, pageSize);
		return userRepository.findAll(pageRequest);
	}

	/**
	 * POST endpoint that echoes the {@code name} request parameter behind a fixed prefix.
	 *
	 * @param name value of the {@code name} request parameter
	 * @return the prefix followed by {@code name}
	 */
	@PostMapping(path = "/testPostParam")
	public String getPostUser(@RequestParam("name") String name){
		final String prefix = "测试POST请求:";
		return prefix + name;
	}

	/**
	 * POST endpoint without parameters.
	 *
	 * @return a fixed confirmation message
	 */
	@PostMapping(path = "/testPost")
	public String getUser(){
		final String reply = "测试POST请求:无参数";
		return reply;
	}

HttpClient

基础入门程序 get方式

package com.hikktn.test;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * Minimal HttpClient example: sends a GET request without parameters and prints the response body.
 *
 * <p>Created 2021/5/2 18:33.
 *
 * @author lisonglin
 * @version 1.0
 */
public class CrawlerFirst {

	/**
	 * Requests {@code http://localhost:8080/user/all} and prints the body when the status code is 200.
	 *
	 * @param args unused
	 * @throws IOException if sending the request, reading the response or closing a resource fails
	 */
	public static void main(String[] args) throws IOException {
		// Request object carrying the target URL
		HttpGet httpGet = new HttpGet("http://localhost:8080/user/all");

		// try-with-resources closes the response first and then the client, even when an exception is thrown;
		// previously neither of them was ever closed.
		try (CloseableHttpClient httpClient = HttpClients.createDefault();
		     CloseableHttpResponse response = httpClient.execute(httpGet)) {
			// Only a 200 response is printed; any other status is ignored
			if (response.getStatusLine().getStatusCode() == 200) {
				HttpEntity entity = response.getEntity();
				String result = EntityUtils.toString(entity, "utf8");
				System.out.println(result);
			}
		}
	}
}

输出

get有参方式

package com.hikktn.test;

import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.net.URISyntaxException;

/**
 * HttpClient example: sends a GET request with a query parameter and prints the response body.
 *
 * <p>Created 2021/5/2 20:18.
 *
 * @author lisonglin
 * @version 1.0
 */
public class HttpGetTest {

	/**
	 * Requests {@code http://localhost:8080/user/getOne?userId=1} with explicit timeouts and prints the
	 * body when the status code is 200. Failures are reported with a stack trace instead of being thrown.
	 *
	 * @param args unused
	 */
	public static void main(String[] args){
		HttpGet httpGet;
		try {
			// URIBuilder appends the parameters to the base address as a query string
			URIBuilder uriBuilder = new URIBuilder("http://localhost:8080/user/getOne");
			uriBuilder.setParameter("userId","1");
			httpGet = new HttpGet(uriBuilder.build());
		} catch (URISyntaxException e) {
			e.printStackTrace();
			// Without a valid URI there is no request to send; continuing used to end in a NullPointerException.
			return;
		}

		RequestConfig requestConfig = RequestConfig.custom()
				// Timeout for establishing the connection (milliseconds)
				.setConnectTimeout(5000)
				// Timeout for obtaining a connection from the connection manager (milliseconds)
				.setConnectionRequestTimeout(5000)
				// Socket read timeout (milliseconds)
				.setSocketTimeout(5000)
				// Follow redirects (true is also the default)
				.setRedirectsEnabled(true).build();

		// Apply the configuration to this GET request
		httpGet.setConfig(requestConfig);

		// try-with-resources closes response and client in reverse order and never touches a response
		// that was not created, which the old finally block did when execute() failed.
		try (CloseableHttpClient httpClient = HttpClients.createDefault();
		     CloseableHttpResponse response = httpClient.execute(httpGet)) {
			// Only a 200 response is printed; any other status is ignored
			if (response.getStatusLine().getStatusCode() == 200) {
				HttpEntity entity = response.getEntity();
				String result = EntityUtils.toString(entity, "utf8");
				System.out.println(result);
			}
		} catch (IOException e) {
			e.printStackTrace();
		}

	}
}

输出

post无参方式

package com.hikktn.test;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.net.URISyntaxException;

/**
 * HttpClient example: sends a POST request without parameters and prints the response body.
 *
 * <p>Created 2021/5/2 20:18.
 *
 * @author lisonglin
 * @version 1.0
 */
public class HttpPostTest {

	/**
	 * Posts to {@code http://localhost:8080/user/testPost} and prints the body when the status code is 200.
	 * Failures are reported with a stack trace instead of being thrown.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		HttpPost httpPost = new HttpPost("http://localhost:8080/user/testPost");

		// try-with-resources closes response and client in reverse order; the old finally block called
		// response.close() on a null reference whenever execute() had failed.
		try (CloseableHttpClient httpClient = HttpClients.createDefault();
		     CloseableHttpResponse response = httpClient.execute(httpPost)) {
			// Only a 200 response is printed; any other status is ignored
			if (response.getStatusLine().getStatusCode() == 200) {
				String result = EntityUtils.toString(response.getEntity(), "utf8");
				System.out.println(result);
			}
		} catch (IOException e) {
			e.printStackTrace();
		}

	}
}

输出

post有参方式

package com.hikktn.test;

import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;

/**
 * HttpClient example: sends a POST request with a form parameter and prints the response body.
 *
 * <p>Created 2021/5/2 20:18.
 *
 * @author lisonglin
 * @version 1.0
 */
public class HttpPostParamTest {

	/**
	 * Posts the form field {@code name=hikktn} to {@code http://localhost:8080/user/testPostParam} and
	 * prints the body when the status code is 200. Failures are reported with a stack trace instead of
	 * being thrown.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		HttpPost httpPost = new HttpPost("http://localhost:8080/user/testPostParam");

		// Form fields are collected as name/value pairs
		List<NameValuePair> params = new ArrayList<>();
		params.add(new BasicNameValuePair("name","hikktn"));
		try {
			// Encode the pairs as a URL-encoded form body and attach it to the request
			UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "utf8");
			httpPost.setEntity(formEntity);
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
		}

		// try-with-resources closes response and client in reverse order; the old finally block called
		// response.close() on a null reference whenever execute() had failed.
		try (CloseableHttpClient httpClient = HttpClients.createDefault();
		     CloseableHttpResponse response = httpClient.execute(httpPost)) {
			// Only a 200 response is printed; any other status is ignored
			if (response.getStatusLine().getStatusCode() == 200) {
				String result = EntityUtils.toString(response.getEntity(), "utf8");
				System.out.println(result);
			}
		} catch (IOException e) {
			e.printStackTrace();
		}

	}
}

输出

Jsoup

基础入门程序

解析文档 HTTPS协议

package com.hikktn.test;

import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.X509TrustManager;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.SecureRandom;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;

/**
 * Installs a JVM-wide default SSL socket factory for {@link HttpsURLConnection} that accepts every
 * certificate chain.
 *
 * <p>WARNING: the trust manager below performs no validation at all, so HTTPS connections made after
 * {@link #init()} are open to man-in-the-middle attacks. Keep this to local experiments.
 *
 * <p>Created 2021/5/3 13:53.
 *
 * @author lisonglin
 * @version 1.0
 */
public class SSLHelper {

	/** User-Agent string offered to callers that want to send a browser-like header. */
	public static String USER_AGENT = "Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 5.0)";

	/**
	 * Creates a TLS context backed by a trust-all manager and registers its socket factory as the default
	 * for {@link HttpsURLConnection}. Setup failures are reported on standard error and leave the
	 * previous default factory in place.
	 */
	static public void init() {
		try {
			SSLContext context = SSLContext.getInstance("TLS");
			context.init(null, new X509TrustManager[]{new X509TrustManager() {
				@Override
				public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
					// Intentionally empty: every client certificate is accepted
				}

				@Override
				public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
					// Intentionally empty: every server certificate is accepted (insecure)
				}

				@Override
				public X509Certificate[] getAcceptedIssuers() {
					return new X509Certificate[0];
				}
			}}, new SecureRandom());
			HttpsURLConnection.setDefaultSSLSocketFactory(context.getSocketFactory());
		} catch (NoSuchAlgorithmException | KeyManagementException e) {
			// These exceptions used to be swallowed silently, hiding the fact that the factory was not installed
			e.printStackTrace();
		}
	}
}
/**
 * jsoup example: fetches a page over HTTPS and prints the text of its {@code <title>} element.
 */
public class JsoupTest {

	/** User-Agent header value sent with the request. */
	public static String USER_AGENT = "Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 5.0)";

	/**
	 * Downloads {@code https://www.bilibili.com/} with browser-like headers and prints the page title.
	 * I/O failures are reported with a stack trace.
	 */
	@Test
	public void testUrl(){
		// Install the default SSL socket factory before opening the HTTPS connection
		SSLHelper.init();
		try {
			// Alternative entry point: Jsoup.parse(new URL("https://www.bilibili.com/"), 1000)
			Connection connect = Jsoup.connect("https://www.bilibili.com/").userAgent(USER_AGENT);
			connect.header("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
			connect.header("Accept-Encoding", "gzip, deflate, sdch");
			connect.header("Accept-Language", "zh-CN,zh;q=0.8");
			connect.timeout(3000);
			// HTTP error responses are parsed instead of raising an exception
			connect.ignoreHttpErrors(true);
			Document document = connect.get();

			// first() returns null when the page has no <title> (possible for an error page),
			// so guard before reading the text instead of failing with a NullPointerException.
			Element title = document.getElementsByTag("title").first();
			if (title == null) {
				System.err.println("No <title> element found in the response");
				return;
			}
			System.out.println(title.text());
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}

输出

准备HTML

<html>
 <head> 
  <title>哔哩哔哩 (゜-゜)つロ 干杯~-bilibili</title> 
 </head> 
 <body>
	<div class="city">
		<h3 id="city_bj">北京中心</h3>
		<div class="city_in">
			<div class="city_con" style="display: none;">
				<ul>
					<li id="test" class="class_a class_b">
						<a href="https://www.bilibili.com/" target="_blank">
							<span class="s_name">北京</span>
						</a>
					</li>
					<li>
						<a href="" target="_blank">
							<span class="s_name">上海</span>
						</a>
					</li>
					<li>
						<a href="" target="_blank">
							<span abc="123" class="s_name">广州</span>
						</a>
					</li>
					<ul>
						<li>天津</li>
					</ul>					
				</ul>
			</div>
		</div>
	</div>
 </body>
</html>

读取HTML文件的标题

	/**
	 * Reads the local HTML file as a string, parses it and prints the text of the first
	 * {@code <title>} element.
	 *
	 * @throws IOException if the file cannot be read
	 */
	@Test
	public void testStringTitle() throws IOException {
		// Load the whole file into memory as UTF-8 text
		File source = new File("C:\\Users\\12134\\Desktop\\test.html");
		String markup = FileUtils.readFileToString(source, "utf-8");

		Document parsed = Jsoup.parse(markup);

		String titleText = parsed.getElementsByTag("title").first().text();
		System.out.println(titleText);
	}

输出

读取HTML文件的body

	/**
	 * Reads the local HTML file as a string, parses it as a body fragment and prints the resulting
	 * {@code <body>} element.
	 *
	 * @throws IOException if the file cannot be read
	 */
	@Test
	public void testStringBody() throws IOException {
		// Load the whole file into memory as UTF-8 text
		File source = new File("C:\\Users\\12134\\Desktop\\test.html");
		String markup = FileUtils.readFileToString(source, "utf-8");

		Element body = Jsoup.parseBodyFragment(markup).body();
		System.out.println(body);

	}

输出:

操作dom

    /**
	 * Parses the local HTML file and prints the results of several element lookups: selector query,
	 * lookup by id (with its id, class names, attributes and text), by class, by tag, by attribute name
	 * and by attribute value.
	 *
	 * @throws IOException if the file cannot be read
	 */
    @Test
	public void testDom() throws IOException {
		// Load the whole file into memory as UTF-8 text and parse it
		File source = new File("C:\\Users\\12134\\Desktop\\test.html");
		Document page = Jsoup.parse(FileUtils.readFileToString(source, "utf-8"));
		final String separator = "===========";

		// jQuery-style selector query
		System.out.println(page.select("#city_bj"));
		System.out.println(separator);

		// Lookup by id
		Element item = page.getElementById("test");
		System.out.println(item);

		// The element's id
		System.out.println("id:" + item.id());

		// The element's class attribute
		System.out.println("className:" + item.className());

		// A single attribute read by name
		System.out.println("attr_id:" + item.attr("id"));

		// All attributes of the element
		System.out.println("all" + item.attributes().toString());

		// The element's text content
		System.out.println(item.text());
		System.out.println(separator);

		// Lookup by CSS class name
		Elements byClass = page.getElementsByClass("class_a");
		System.out.println(byClass);

		System.out.println(separator);

		// Lookup by tag name
		Elements spans = page.getElementsByTag("span");
		System.out.println(spans);

		System.out.println(separator);

		// Lookup by presence of a custom attribute
		Elements withAbc = page.getElementsByAttribute("abc");
		System.out.println(withAbc);

		System.out.println(separator);
		// Lookup by attribute value; per the original note, an element whose class attribute holds
		// two names (class="class_a class_b") is not matched this way
		Elements byClassValue = page.getElementsByAttributeValue("class", "s_name");
		System.out.println(byClassValue);
	}

输出

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 4
    评论
使用Java编写基于HttpClient和Jsoup的爬虫,需要进行以下步骤:

1. 首先,导入HttpClient和Jsoup的依赖包。可以使用maven或gradle进行依赖管理。
2. 创建一个HttpClient实例,用于发送HTTP请求和接收响应。可以使用HttpClients.createDefault()方法创建一个默认配置的实例。
3. 创建一个HttpGet实例,设置请求URL和请求头信息。可以使用new HttpGet(url)方法创建一个HttpGet实例,然后使用setHeader()方法设置请求头信息。
4. 发送HTTP请求,并获取响应结果。可以使用HttpClient.execute()方法发送请求,并使用HttpResponse.getEntity()方法获取响应实体。
5. 解析HTML内容。可以使用Jsoup.parse()方法解析HTML内容,然后使用Jsoup提供的API进行内容提取和处理。

以下是一个使用HttpClient和Jsoup进行网页爬取的示例代码:

```java
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;

public class WebCrawler {
    public static void main(String[] args) throws IOException {
        // 创建一个HttpClient实例
        HttpClient httpClient = HttpClients.createDefault();
        // 创建一个HttpGet实例,设置请求URL和请求头信息
        HttpGet httpGet = new HttpGet("https://www.example.com");
        httpGet.setHeader("User-Agent", "Mozilla/5.0");
        // 发送HTTP请求,并获取响应结果
        HttpResponse httpResponse = httpClient.execute(httpGet);
        String html = EntityUtils.toString(httpResponse.getEntity(), "UTF-8");
        // 解析HTML内容
        Document document = Jsoup.parse(html);
        String title = document.title();
        System.out.println("Title: " + title);
    }
}
```

在这个示例中,我们使用HttpClient发送了一个GET请求到https://www.example.com,并获取了响应结果。然后使用Jsoup解析HTML内容,并获取了网页的标题。
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

hikktn

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值