jsoup的使用
这里面都是一些测试文件,用于以后参考使用
/**
 * Returns the video ranking entries for one ranking zone.
 *
 * <p>The jsoup-based scraping that used to fill the list is disabled and kept in the body only as
 * reference, so this method currently logs a notice and returns an empty list.
 *
 * <p>Sample data left in the original notes (presumably an example of a ranked video and its
 * uploader; confirm before relying on it): video https://www.bilibili.com/video/BV1uf4y127Ab,
 * uploader page //space.bilibili.com/13899470.
 *
 * @param BvRankzone value the disabled scraping code appended to the ranking URL and stored on
 *     every entry; unused while that code stays disabled
 * @return a new, empty, mutable list
 * @deprecated this is the old crawling approach and it is far too slow
 */
@Deprecated
public static List<BVideoRank> bVideoRankArraylist(Integer BvRankzone) {
    List<BVideoRank> bVideoRankList = new ArrayList<>();
    log.info("==========耐心等待几分钟==========");
    // Disabled reference implementation.
    // Parse the page (the Document returned by jsoup is the document object).
    //Document document = jsoupUtil.getHtmlContent(bilibiliConstants.RANK_URL_PREFIX + LeaderboardTypeConstants.ALL.getValue() + "/" + BvRankzone + bilibiliConstants.RANK_URL_SUFFIX);
    //int videoRankCount = bilibiliConstants.VIDEO_DATA_FLAG;
    //for (Element element : document.select("li[class=rank-item]")) {
    //    String bvNumber = element.select(".img").select("a").attr("href").split("/")[4];
    //    String bvTitle = BVStringUtil.filterEmoji(element.select(".img").select("img").attr("alt"));
    //    String bvUpuuid = element.select(".detail").select("a").attr("href").split("/")[3];
    //    String bvUp = BVStringUtil.filterEmoji(element.select(".detail").select("a").select("span").text());
    //    String bvScore = element.select(".pts").text().split(" ")[0];
    //    BVideoRank bVideoRank = new BVideoRank();
    //    bVideoRank.setBvNumber(bvNumber);
    //    bVideoRank.setBvRanknum(videoRankCount);
    //    bVideoRank.setBvTitle(bvTitle);
    //    bVideoRank.setBvTime(DateUtils.getLocalCurrentDate());
    //    bVideoRank.setBvRankzone(BvRankzone);
    //    bVideoRank.setBvScore(bvScore);
    //    bVideoRank.setBvUp(bvUp);
    //    bVideoRank.setBvUpuuid(bvUpuuid);
    //    bVideoRankList.add(bVideoRank);
    //    videoRankCount++;
    //}
    return bVideoRankList;
}
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.X509TrustManager;
import java.security.SecureRandom;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
public class HttpsUtil {

    /**
     * Makes {@link HttpsURLConnection} trust every site so that any https page can be fetched.
     *
     * <p>Installs, as JVM-wide defaults, a hostname verifier that accepts every host and a socket
     * factory whose trust manager accepts every certificate chain. This switches off TLS
     * authentication for all connections that use those defaults, so it is only suitable for
     * test or scraping experiments.
     *
     * <p>The call is best-effort: a failure is reported on standard error and not rethrown.
     */
    public static void trustEveryone() {
        try {
            // Build and initialise the context before touching any global default, so that a
            // failure while creating it cannot leave hostname verification disabled on its own.
            SSLContext context = SSLContext.getInstance("TLS");
            context.init(null, new X509TrustManager[]{new X509TrustManager() {
                @Override
                public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
                    // Accept every client certificate chain.
                }

                @Override
                public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
                    // Accept every server certificate chain.
                }

                @Override
                public X509Certificate[] getAcceptedIssuers() {
                    return new X509Certificate[0];
                }
            }}, new SecureRandom());

            HttpsURLConnection.setDefaultHostnameVerifier((hostname, session) -> true);
            HttpsURLConnection.setDefaultSSLSocketFactory(context.getSocketFactory());
        } catch (Exception e) {
            // Stay best-effort, but do not hide the failure from whoever runs the program.
            System.err.println("HttpsUtil.trustEveryone: could not install trust-all HTTPS defaults: " + e);
        }
    }
}
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
import java.net.MalformedURLException;
public class httptest {

    /**
     * Entry point of a disabled HtmlUnit experiment.
     *
     * <p>Every statement is commented out and kept only as reference, so running this method does
     * nothing.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        //WebClient webClient = new WebClient(BrowserVersion.CHROME);
        //// enable JavaScript support
        //webClient.getOptions().setJavaScriptEnabled(true);// enable the JS interpreter (default is true)
        //webClient.getOptions().setCssEnabled(false);// disable CSS support
        //webClient.getOptions().setActiveXNative(false);
        //webClient.getOptions().setCssEnabled(false);
        //webClient.getOptions().setThrowExceptionOnScriptError(false);
        //webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        //webClient.getOptions().setTimeout(1000);
        //webClient.getOptions().setUseInsecureSSL(true);
        //HtmlPage rootPage = null;
        //try {
        //    rootPage = webClient.getPage("https://www.bilibili.com/video/BV1Hk4y1r7D3");
        //} catch (FailingHttpStatusCodeException | IOException e) {
        //    e.printStackTrace();
        //}
        //// give background JavaScript some time to run
        //webClient.waitForBackgroundJavaScript(1000);
        //assert rootPage != null;
        //String html = rootPage.asXml();
        //Document doc = Jsoup.parse(html);
        //
        //
        //System.out.println(doc.getElementsByClass("ops").html());
    }
}
package com.site.bdata.test;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.X509TrustManager;
import java.io.IOException;
import java.security.SecureRandom;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
public class JsoupSSL {

    /**
     * Fetches a bilibili page over https with jsoup and prints the parsed document.
     *
     * <p>NOTE(review): {@code validateTLSCertificates} was deprecated and later removed from
     * jsoup; confirm it still exists in the jsoup version this project builds against.
     *
     * @param args unused
     * @throws IOException if the page cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        Document document = Jsoup.connect("https://www.bilibili.com/video/online.html")
                .validateTLSCertificates(true)
                .get();
        System.out.println(document);
    }

    /*
     * Background, translated from the author's notes:
     * many sites now encrypt their traffic with SSL, which kept a plain HTTP connection from
     * reading such pages, and jsoup originally did nothing about it. One workaround is to give
     * the local HttpsURLConnection a "universal certificate":
     *   - replace the default HostnameVerifier with one that returns true for every site;
     *   - replace the default SSLSocketFactory with one built from a context whose trust manager
     *     accepts every certificate chain.
     * jsoup's Connection later offered validateTLSCertificates(boolean) to switch TLS certificate
     * validation on or off; turning it off is not recommended.
     *
     * The initializer below applies the workaround JVM-wide when this class is loaded. It is
     * best-effort: a failure is printed and class loading continues.
     */
    static {
        try {
            // Build and initialise the context before touching any global default, so that a
            // failure while creating it cannot leave hostname verification disabled on its own.
            SSLContext context = SSLContext.getInstance("TLS");
            // Alternative kept from the original notes:
            // SSLContext context = SSLContext.getInstance("TLSv1.2");
            context.init(null, new X509TrustManager[]{new X509TrustManager() {
                @Override
                public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
                    // Accept every client certificate chain.
                }

                @Override
                public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
                    // Accept every server certificate chain.
                }

                @Override
                public X509Certificate[] getAcceptedIssuers() {
                    return new X509Certificate[0];
                }
            }}, new SecureRandom());

            // Accept every host name.
            HttpsURLConnection.setDefaultHostnameVerifier((hostname, session) -> true);
            // Use the trust-all context for every new HttpsURLConnection.
            HttpsURLConnection.setDefaultSSLSocketFactory(context.getSocketFactory());
        } catch (Exception e) {
            System.err.println("JsoupSSL: could not install trust-all HTTPS defaults");
            e.printStackTrace();
        }
    }
}
import java.io.IOException;
import java.util.logging.Level;
public class jsouptest {
public static void main(String[] args) throws IOException {
// String url = "https://www.toutiao.com/";
Connection connect = Jsoup.connect(url);
Document document = connect.get();
System.out.println(document);
//
// //构造一个webClient 模拟Chrome 浏览器
// WebClient webClient = new WebClient(BrowserVersion.CHROME);
屏蔽日志信息
LogFactory.getLog().setAttribute("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog");
java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
支持JavaScript
// webClient.getOptions().setJavaScriptEnabled(true);
// webClient.getOptions().setCssEnabled(false);
// webClient.getOptions().setActiveXNative(false);
// webClient.getOptions().setCssEnabled(false);
// webClient.getOptions().setThrowExceptionOnScriptError(false);
// webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
// webClient.getOptions().setTimeout(5000);
// HtmlPage rootPage = webClient.getPage(url);
// //设置一个运行JavaScript的时间
// webClient.waitForBackgroundJavaScript(5000);
// String html = rootPage.asXml();
//
// Document document = Jsoup.parse(html);
// System.out.println(document);
}
}
package com.site.bdata.util;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.net.MalformedURLException;
import java.net.URL;
/**
 * Holder for the former HtmlUnit + jsoup page-loading helpers.
 *
 * <p>Every member is commented out and kept only as reference, so the class currently declares
 * nothing.
 *
 * @author Programmer Li
 */
public class jsoupUtil {
    // Disabled: loads a page with HtmlUnit, waits for its JavaScript, and returns the page as XML.
    //private static String getHtmlPageResponse(String url) throws Exception {
    //    // request timeout (the original note says "default 200 seconds", which does not match
    //    // the value below; the unit is presumably milliseconds - confirm)
    //    int timeout = 9000;
    //    // time to wait for asynchronous JS (same "default 200 seconds" note in the original)
    //    int waitForBackgroundJavaScript = 9000;
    //    String result = "";
    //    final WebClient webClient = new WebClient(BrowserVersion.CHROME);
    //    // whether to throw when JS execution fails
    //    webClient.getOptions().setThrowExceptionOnScriptError(false);
    //    // whether to throw when the HTTP status is not 200
    //    webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
    //    webClient.getOptions().setActiveXNative(false);
    //    // ----- important ----- use our own error-handling class
    //    webClient.setJavaScriptErrorListener(new MyJSErrorListener());
    //
    //    // whether to enable CSS
    //    webClient.getOptions().setCssEnabled(false);
    //    // very important: enable JS
    //    webClient.getOptions().setJavaScriptEnabled(true);
    //    // very important: enable AJAX support
    //    webClient.setAjaxController(new NicelyResynchronizingAjaxController());
    //    // request timeout of the "browser"
    //    webClient.getOptions().setTimeout(timeout);
    //    // timeout for JS execution
    //    webClient.setJavaScriptTimeout(timeout);
    //    HtmlPage page;
    //    try {
    //        page = webClient.getPage(url);
    //    } catch (Exception e) {
    //        webClient.close();
    //        throw e;
    //    }
    //    // this call blocks the thread
    //    webClient.waitForBackgroundJavaScript(waitForBackgroundJavaScript);
    //    result = page.asXml();
    //    webClient.close();
    //    return result;
    //}
    // Disabled: fetches the page through getHtmlPageResponse and parses it with jsoup.
    //public static Document getHtmlContent(String url){
    //    // send the request
    //    String content = null;
    //    try {
    //        content = getHtmlPageResponse(url);
    //    } catch (Exception e) {
    //        e.printStackTrace();
    //    }
    //    // parse the page into a document object
    //    return Jsoup.parse(content);
    //}
    /*
     * Disabled: listener that ignores every JS loading error HtmlUnit would otherwise print.
     */
    //public static class MyJSErrorListener extends DefaultJavaScriptErrorListener {
    //    @Override
    //    public void scriptException(HtmlPage page, ScriptException scriptException) {
    //    }
    //
    //    @Override
    //    public void timeoutError(HtmlPage page, long allowedTime, long executionTime) {
    //    }
    //
    //    @Override
    //    public void malformedScriptURL(HtmlPage page, String url, MalformedURLException malformedURLException) {
    //
    //    }
    //
    //    @Override
    //    public void loadScriptError(HtmlPage page, URL scriptUrl, Exception exception) {
    //
    //    }
    //
    //    @Override
    //    public void warn(String message, String sourceName, int line, String lineSource, int lineOffset) {
    //
    //    }
    //}
    // Disabled: manual check that prints the HTML of the elements with class "ops".
    //public static void main(String[] args) {
    //    Document htmlContent = getHtmlContent("https://www.bilibili.com/video/BV1Vk4y1r7qs");
    //    System.out.println(htmlContent.getElementsByClass("ops").html());
    //
    //}
}