1.需要导入的资源
<dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.9.2</version> </dependency> <dependency> <groupId>org.apache.httpcomponents</groupId> <artifactId>httpclient</artifactId> <version>4.5.13</version> </dependency>
2.这是我自己写的工具类,用于发送GET和POST请求, headers是需要自己填的请求头信息
import org.apache.http.HttpEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpPost; import org.apache.http.entity.StringEntity; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import java.io.BufferedReader; import java.io.InputStream; import java.io.InputStreamReader; import java.util.Map; import java.util.zip.GZIPInputStream; public class HTTPClientUtil { //发送get请求 public String testGet(String url, Map<String, String> headers) throws Exception { System.out.println("请求方式:GET"); System.out.println("请求URL:"+url); String re=""; // 创建HttpClient对象 CloseableHttpClient httpClient = HttpClients.createDefault(); // 创建GET请求 HttpGet httpGet = new HttpGet(url); { for (String key:headers.keySet()){ httpGet.setHeader(key,headers.get(key)); } } // 获取响应结果 CloseableHttpResponse response = httpClient.execute(httpGet); System.out.println("响应码:"+response.getStatusLine().getStatusCode()); if(response.getStatusLine().getStatusCode()==200) { String result = ParseResponse(response); httpClient.close(); response.close(); return result; }else{ httpClient.close(); response.close(); return null; } } //发送post请求 public String testPost(Map<String, String> data,Map<String, String> headers) throws Exception { System.out.println("请求方式:POST"); System.out.println("请求url:"+data.get("url")); // 创建HttpClient对象 CloseableHttpClient httpClient = HttpClients.createDefault(); // 创建GET请求 HttpPost httpGet = new HttpPost(data.get("url")); { for (String key:headers.keySet()){ httpGet.setHeader(key,headers.get(key)); } } // 设置请求体 if(data.containsKey("data")) { System.out.println("请求体:"+data.get("data")); StringEntity params = new StringEntity(data.get("data")); httpGet.setEntity(params); } // 获取响应结果 CloseableHttpResponse response = httpClient.execute(httpGet); 
System.out.println("响应码:"+response.getStatusLine().getStatusCode()); if(response.getStatusLine().getStatusCode()==200) { String result = ParseResponse(response); httpClient.close(); response.close(); return result; }else{ httpClient.close(); response.close(); return null; } } public String ParseResponse(CloseableHttpResponse response){ String re=null; try { HttpEntity entity = response.getEntity(); // 检查响应是否被gzip压缩 if (false) { System.out.println("检查响应被gzip压缩"); InputStream gzipStream = entity.getContent(); InputStreamReader gzipStreamReader = new InputStreamReader(new GZIPInputStream(gzipStream)); BufferedReader bufferedReader = new BufferedReader(gzipStreamReader); StringBuilder stringBuilder = new StringBuilder(); String line; while ((line = bufferedReader.readLine()) != null) { System.out.println("打印内容:"+line); stringBuilder.append(line).append("\n"); } // 打印未压缩的HTML内容 re=stringBuilder.toString(); } else { // 如果不是gzip压缩,直接打印内容 re= EntityUtils.toString(entity,"UTF-8" ); } }catch (Exception e){ e.printStackTrace(); } return re; } }
3.这是爬取一个网站时用到的Jsoup部分方法,仅供参考
import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; public class JsoupUtil { // 辅助方法:判断一个字符是否是汉字(这里简化判断,只判断是否在汉字字符集范围内) private static boolean isHanzi(char c) { return c >= '\u4e00' && c <= '\u9fa5'; } private Map<String, Object> parseHtml_text_page(String html) { Map<String, Object> map = new HashMap<>(); try { String text = ""; List<String> list = new ArrayList<>(); // 获取html的文档对象 Document doc = Jsoup.parse(html); try { 网页的文章内容 Element content = doc.getElementById("chapterinfo"); text = content.html(); } catch (Exception e) { e.printStackTrace(); } //解析页数 try { //根据class找组件 Elements element2 = doc.getElementsByClass("chapterPages"); //根据id找组件 //doc.getElementById() String arr2 = element2.text(); if (arr2 != null && arr2.length() > 0) { System.out.println(arr2); String[] split = arr2.split("【"); //【1】【2】【3】【4】【5】【6】 if (split.length > 1) { for (int i = 1; i < split.length; i++) { //获取a标签包含的内容获取a标签 Elements select = doc.select("a:contains(" + "【" + split[i] + ")"); String href = select.get(0).attr("href"); if (href.contains("html")) list.add(href); } } } } catch (Exception e) { e.printStackTrace(); } map.put("text", ""); map.put("list", list); System.out.println("章节页数:" + list.toString()); //System.out.println("章节文本:"+text); } catch (Exception e) { e.printStackTrace(); } return map; } }