jsoup的使用

jsoup的使用

以下是几段 jsoup 相关的测试代码,留作日后参考。

 /**
     * Builds the Bilibili video ranking list for the given zone.
     *
     * <p>The jsoup-based scraping logic is kept below only as commented-out reference
     * code, so this method currently logs a message and returns an empty, mutable list.
     *
     * <p>Sample values noted by the original author (presumably one ranked video and its
     * uploader at the time of writing; confirm before relying on them):
     * video https://www.bilibili.com/video/BV1uf4y127Ab, whose title is roughly
     * "a joke when I was a kid, reality now that I am grown";
     * uploader page //space.bilibili.com/13899470, uploader name "喜娃桑".
     *
     * @param BvRankzone ranking zone identifier; only referenced by the disabled scraping code
     * @return an empty list, because the code that would populate it is disabled
     * @deprecated This is the old scraping approach; it was too slow.
     */
    @Deprecated
    public static List<BVideoRank> bVideoRankArraylist(Integer BvRankzone) {

        List<BVideoRank> bVideoRankList = new ArrayList<>();
        log.info("==========耐心等待几分钟==========");
        // Parse the page (jsoup returns a Document object).
        //Document document = jsoupUtil.getHtmlContent(bilibiliConstants.RANK_URL_PREFIX + LeaderboardTypeConstants.ALL.getValue() + "/" + BvRankzone + bilibiliConstants.RANK_URL_SUFFIX);
        //int videoRankCount = bilibiliConstants.VIDEO_DATA_FLAG;
        //for (Element element : document.select("li[class=rank-item]")) {
        //    String bvNumber = element.select(".img").select("a").attr("href").split("/")[4];
        //    String bvTitle = BVStringUtil.filterEmoji(element.select(".img").select("img").attr("alt"));
        //    String bvUpuuid = element.select(".detail").select("a").attr("href").split("/")[3];
        //    String bvUp = BVStringUtil.filterEmoji(element.select(".detail").select("a").select("span").text());
        //    String bvScore = element.select(".pts").text().split(" ")[0];
        //    BVideoRank bVideoRank = new BVideoRank();
        //    bVideoRank.setBvNumber(bvNumber);
        //    bVideoRank.setBvRanknum(videoRankCount);
        //    bVideoRank.setBvTitle(bvTitle);
        //    bVideoRank.setBvTime(DateUtils.getLocalCurrentDate());
        //    bVideoRank.setBvRankzone(BvRankzone);
        //    bVideoRank.setBvScore(bvScore);
        //    bVideoRank.setBvUp(bvUp);
        //    bVideoRank.setBvUpuuid(bvUpuuid);
        //    bVideoRankList.add(bVideoRank);
        //    videoRankCount++;
        //}

        return bVideoRankList;

    }
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.X509TrustManager;
import java.security.SecureRandom;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;

/**
 * Helper that switches off HTTPS verification for {@link HttpsURLConnection}.
 */
public class HttpsUtil {

    /**
     * Trusts every site so that any HTTPS page can be fetched.
     *
     * <p>Replaces the default host-name verifier of {@link HttpsURLConnection} with one
     * that accepts every host, and replaces its default SSL socket factory with one whose
     * trust manager accepts every certificate chain. Both are static defaults, so the
     * change applies to every later {@code HttpsURLConnection} that uses the defaults.
     *
     * <p><b>Security warning:</b> this removes protection against man-in-the-middle
     * attacks; use it only for testing or for crawling content that is not sensitive.
     *
     * <p>The call is best-effort: if the TLS context cannot be set up, the failure is
     * logged as a warning and the previous defaults stay in place.
     */
    public static void trustEveryone() {
        try {
            // Accept every host name regardless of what the certificate says.
            HttpsURLConnection.setDefaultHostnameVerifier((hostname, session) -> true);
            SSLContext context = SSLContext.getInstance("TLS");
            context.init(null, new X509TrustManager[]{new X509TrustManager() {
                @Override
                public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
                    // Intentionally empty: never rejects a client certificate chain.
                }

                @Override
                public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
                    // Intentionally empty: never rejects a server certificate chain.
                }

                @Override
                public X509Certificate[] getAcceptedIssuers() {
                    return new X509Certificate[0];
                }
            }}, new SecureRandom());
            HttpsURLConnection.setDefaultSSLSocketFactory(context.getSocketFactory());
        } catch (java.security.GeneralSecurityException | SecurityException e) {
            // Previously swallowed silently; report it so a failed setup can be diagnosed.
            java.util.logging.Logger.getLogger(HttpsUtil.class.getName())
                    .log(java.util.logging.Level.WARNING, "Failed to install trust-all HTTPS defaults", e);
        }
    }
}



import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.net.MalformedURLException;

public class httptest {
    /**
     * Placeholder entry point. The HtmlUnit-based page fetch below is disabled and kept
     * only as reference code, so running this method currently does nothing.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {

        //WebClient webClient = new WebClient(BrowserVersion.CHROME);
        // Enable JavaScript support
        //webClient.getOptions().setJavaScriptEnabled(true);// enable the JS interpreter (default is true)
        //webClient.getOptions().setCssEnabled(false);// disable CSS support
        //webClient.getOptions().setActiveXNative(false);
        //webClient.getOptions().setCssEnabled(false);
        //webClient.getOptions().setThrowExceptionOnScriptError(false);
        //webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        //webClient.getOptions().setTimeout(1000);
        //webClient.getOptions().setUseInsecureSSL(true);
        //HtmlPage rootPage = null;
        //try {
        //    rootPage = webClient.getPage("https://www.bilibili.com/video/BV1Hk4y1r7D3");
        //} catch (FailingHttpStatusCodeException | IOException e) {
        //    e.printStackTrace();
        //}
        // Allow some time for background JavaScript to run
        //webClient.waitForBackgroundJavaScript(1000);
        //assert rootPage != null;
        //String html = rootPage.asXml();
        //Document doc = Jsoup.parse(html);
        //
        //
        //System.out.println(doc.getElementsByClass("ops").html());
    }
}

package com.site.bdata.test;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.X509TrustManager;
import java.io.IOException;
import java.security.SecureRandom;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;

/**
 * Demo: fetching an HTTPS page with jsoup while certificate checks are disabled.
 *
 * <p>Background from the original author: many sites encrypt traffic with SSL/TLS, which
 * can stop a plain HTTP connection from fetching the page, and jsoup originally had no
 * special handling for this. The workaround used here is to configure a "trust
 * everything" setup on {@link HttpsURLConnection}:
 * <ul>
 *   <li>replace the default host-name verifier with one that returns {@code true} for
 *       every site;</li>
 *   <li>replace the default SSL socket factory with one whose trust manager accepts every
 *       certificate chain.</li>
 * </ul>
 * jsoup's {@code Connection} later offered {@code validateTLSCertificates(boolean)} to
 * control TLS certificate validation; disabling validation is not recommended.
 *
 * <p>NOTE(review): {@code validateTLSCertificates} is reported as deprecated in later jsoup
 * releases; confirm against the jsoup version this project uses.
 *
 * <p><b>Security warning:</b> the static initializer removes protection against
 * man-in-the-middle attacks for every connection that uses the {@code HttpsURLConnection}
 * defaults. Use it for testing only.
 */
public class JsoupSSL {
    /**
     * Fetches the Bilibili "online" page and prints the parsed document to standard output.
     *
     * @param args command-line arguments (unused)
     * @throws IOException if the page cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        Document document = Jsoup.connect("https://www.bilibili.com/video/online.html").validateTLSCertificates(true).get();
        System.out.println(document);
    }

    // Runs once when the class is loaded, before main() executes.
    static {
        try {
            // Make the default host-name verifier return true for every site.
            HttpsURLConnection.setDefaultHostnameVerifier((hostname, session) -> true);
            // Build a TLS context whose only trust manager accepts everything.
            // Alternative kept from the original: SSLContext.getInstance("TLSv1.2")
            SSLContext context = SSLContext.getInstance("TLS");
            context.init(null, new X509TrustManager[]{new X509TrustManager() {
                @Override
                public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
                    // Intentionally empty: never rejects a client certificate chain.
                }

                @Override
                public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
                    // Intentionally empty: never rejects a server certificate chain.
                }

                @Override
                public X509Certificate[] getAcceptedIssuers() {
                    return new X509Certificate[0];
                }
            }}, new SecureRandom());

            // Install the permissive socket factory as the HttpsURLConnection default.
            HttpsURLConnection.setDefaultSSLSocketFactory(context.getSocketFactory());
        } catch (java.security.GeneralSecurityException | SecurityException e) {
            // Best-effort: report the failure through a logger instead of printStackTrace().
            java.util.logging.Logger.getLogger(JsoupSSL.class.getName())
                    .log(java.util.logging.Level.WARNING, "Failed to install trust-all HTTPS defaults", e);
        }
    }

}

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.util.logging.Level;

public class jsouptest {
    public static void main(String[] args) throws IOException {
//        String url = "https://www.toutiao.com/";
        Connection connect = Jsoup.connect(url);
        Document document = connect.get();
        System.out.println(document);
//
//        //构造一个webClient 模拟Chrome 浏览器
//        WebClient webClient = new WebClient(BrowserVersion.CHROME);
屏蔽日志信息
        LogFactory.getLog().setAttribute("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog");
        java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
支持JavaScript
//        webClient.getOptions().setJavaScriptEnabled(true);
//        webClient.getOptions().setCssEnabled(false);
//        webClient.getOptions().setActiveXNative(false);
//        webClient.getOptions().setCssEnabled(false);
//        webClient.getOptions().setThrowExceptionOnScriptError(false);
//        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
//        webClient.getOptions().setTimeout(5000);
//        HtmlPage rootPage = webClient.getPage(url);
//        //设置一个运行JavaScript的时间
//        webClient.waitForBackgroundJavaScript(5000);
//        String html = rootPage.asXml();
//
//        Document document = Jsoup.parse(html);
//        System.out.println(document);

    }
}

package com.site.bdata.util;


import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.net.MalformedURLException;
import java.net.URL;

/**
 * Placeholder utility class: every member below is commented out and kept only as
 * reference code for fetching JavaScript-rendered pages with HtmlUnit and parsing the
 * result with jsoup. As written, the class has no members.
 *
 * @author Programmer Li
 */
public class jsoupUtil {
    //private static String getHtmlPageResponse(String url) throws Exception {
    //    // Request timeout.
    //    // NOTE(review): the original comment said "default 200 seconds", but the value is 9000 - confirm the unit.
    //    int timeout = 9000;
    //    // Time to wait for asynchronous JS to finish (original comment also said "default 200 seconds")
    //    int waitForBackgroundJavaScript = 9000;
    //    String result = "";
    //    final WebClient webClient = new WebClient(BrowserVersion.CHROME);
    //    // Whether to throw an exception when JS execution fails
    //    webClient.getOptions().setThrowExceptionOnScriptError(false);
    //    // Whether to throw an exception when the HTTP status is not 200
    //    webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
    //    webClient.getOptions().setActiveXNative(false);
    //    // ----- Key point ----- use our custom error listener class
    //    webClient.setJavaScriptErrorListener(new MyJSErrorListener());
    //
    //    // Whether to enable CSS
    //    webClient.getOptions().setCssEnabled(false);
    //    // Important: enable JS
    //    webClient.getOptions().setJavaScriptEnabled(true);
    //    // Important: enable AJAX support
    //    webClient.setAjaxController(new NicelyResynchronizingAjaxController());
    //    // Set the "browser" request timeout
    //    webClient.getOptions().setTimeout(timeout);
    //    // Set the JS execution timeout
    //    webClient.setJavaScriptTimeout(timeout);
    //    HtmlPage page;
    //    try {
    //        page = webClient.getPage(url);
    //    } catch (Exception e) {
    //        webClient.close();
    //        throw e;
    //    }
    //    // This call blocks the current thread
    //    webClient.waitForBackgroundJavaScript(waitForBackgroundJavaScript);
    //    result = page.asXml();
    //    webClient.close();
    //    return result;
    //}

    //public static Document getHtmlContent(String url){
    //    // Send the request
    //    String content = null;
    //    try {
    //        content = getHtmlPageResponse(url);
    //    } catch (Exception e) {
    //        e.printStackTrace();
    //    }
    //    // Parse the page into a Document object
    //    // NOTE(review): content is still null here if the request failed - verify Jsoup.parse handles that.
    //    return Jsoup.parse(content);
    //}


    /**
     * Ignores all JS loading error messages printed by HtmlUnit.
     */
    //public static class MyJSErrorListener extends DefaultJavaScriptErrorListener {
    //    @Override
    //    public void scriptException(HtmlPage page, ScriptException scriptException) {
    //    }
    //
    //    @Override
    //    public void timeoutError(HtmlPage page, long allowedTime, long executionTime) {
    //    }
    //
    //    @Override
    //    public void malformedScriptURL(HtmlPage page, String url, MalformedURLException malformedURLException) {
    //
    //    }
    //
    //    @Override
    //    public void loadScriptError(HtmlPage page, URL scriptUrl, Exception exception) {
    //
    //    }
    //
    //    @Override
    //    public void warn(String message, String sourceName, int line, String lineSource, int lineOffset) {
    //
    //    }
    //}

    //public static void main(String[] args) {
    //    Document htmlContent = getHtmlContent("https://www.bilibili.com/video/BV1Vk4y1r7qs");
    //    System.out.println(htmlContent.getElementsByClass("ops").html());
    //
    //}
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值