Java调用Jsoup解析网页

最新推荐文章于 2024-05-15 15:36:46 发布

ichuany

最新推荐文章于 2024-05-15 15:36:46 发布

阅读量435

点赞数

分类专栏： Jsoup 文章标签： java 开发语言

本文链接：https://blog.csdn.net/ichuany/article/details/125600530

版权

Jsoup 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

1. 在 Maven 的 pom.xml 中引入 Jsoup 依赖

<!-- Jsoup HTML parser library (org.jsoup:jsoup, pinned to 1.10.2); required by the parsing example in section 2. -->
<dependency>
  <groupId>org.jsoup</groupId>
  <artifactId>jsoup</artifactId>
  <version>1.10.2</version>
</dependency>

2. 解析代码示例

/**
     * Searches the remote site for {@code str}, follows the first hit to its detail page and
     * scrapes the book information from it.
     *
     * <p>Steps:
     * <ol>
     *   <li>Request the search page with {@code str} as the (URL-encoded) {@code q} parameter.</li>
     *   <li>Take the first element with class {@code title}; its anchor supplies the book name and
     *       a redirect link whose {@code url=} parameter carries the real detail-page address.</li>
     *   <li>Request the detail page and read the cover image, the {@code #info} block and the
     *       {@code .intro} sections from the first {@code .article} element.</li>
     * </ol>
     *
     * <p>Optional parts of the detail page (cover image, info block, either intro section) that
     * are absent are reported as empty strings instead of failing the whole request.
     *
     * @param str search keyword appended to the search URL
     * @return a success response wrapping a map with the keys {@code bookName}, {@code info},
     *         {@code isbn}, {@code imgSrc}, {@code intro}, {@code authorDetail} plus whatever
     *         {@link #analyseBookInfo(String)} extracted from the info block
     * @throws IllegalArgumentException if {@code str} is {@code null}
     * @throws IOException if a page cannot be fetched, or if the search hit / redirect link /
     *         article container expected on the page is missing
     */
    public ServerResponse getInfo(String str) throws IOException
    {
        if (str == null) {
            throw new IllegalArgumentException("str must not be null");
        }

        // Encode the keyword so spaces, '&', '#', non-ASCII text etc. cannot corrupt the query string.
        String url = "https://www.xxxxxxxx.com/xxxxxxxx?q=" + java.net.URLEncoder.encode(str, "UTF-8");

        // Basic information: first hit on the search page.
        Document document = fetchDocument(url);
        Element book = document.getElementsByClass("title").first();
        if (book == null) {
            throw new IOException("No element with class 'title' found in search result for: " + str);
        }
        String name = book.getElementsByTag("a").html();
        String newUrl = book.select("a").attr("href");
        String splitUrl = extractTargetUrl(newUrl);

        // Detailed information: the page the search hit redirects to.
        Document documentLink = fetchDocument(splitUrl);
        Element detailInfo = documentLink.getElementsByClass("article").first();
        if (detailInfo == null) {
            throw new IOException("No element with class 'article' found at: " + splitUrl);
        }

        Element mainPic = detailInfo.getElementById("mainpic");
        String imgSrc = (mainPic == null) ? "" : mainPic.select("img").attr("src");

        Element baseInfo = detailInfo.getElementById("info");
        // Strip every HTML tag from the info block, leaving only its text.
        String infoDetail = (baseInfo == null) ? "" : baseInfo.toString().replaceAll("</?[^>]+>", "");

        Elements intros = detailInfo.getElementsByClass("intro");
        String intro = (intros.size() > 0) ? intros.get(0).select("p").html() : "";        // book synopsis
        String authorDetail = (intros.size() > 1) ? intros.get(1).select("p").html() : ""; // author biography

        // Field-level details parsed from the info text; the remaining keys are added on top.
        Map<String,String> result = analyseBookInfo(infoDetail);
        result.put("bookName", name);
        result.put("info", infoDetail);
        // The original referenced an undeclared variable `isbn`; the search keyword is the only
        // candidate in scope. NOTE(review): confirm that callers pass the ISBN as `str`.
        result.put("isbn", str);
        // The image is served through the images.weserv.nl proxy; no link is emitted without a source.
        result.put("imgSrc", imgSrc.isEmpty() ? "" : "https://images.weserv.nl/?url=" + imgSrc);
        result.put("intro", intro);
        result.put("authorDetail", authorDetail);
        return ServerResponse.createBySuccess(result);
    }

    /** Fetches {@code url} with a 5 second timeout and the browser-like request headers used for every page. */
    private Document fetchDocument(String url) throws IOException
    {
        return Jsoup.connect(url)
                .timeout(5000)
                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
                .header("Accept-Encoding", "gzip, deflate, sdch")
                .header("Accept-Language", "zh-CN,zh;q=0.8")
                .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36")
                .get();
    }

    /** Extracts and percent-decodes the value of the {@code url=} parameter from a redirect link. */
    private String extractTargetUrl(String redirectHref) throws IOException
    {
        final String startMarker = "url=";
        int start = redirectHref.indexOf(startMarker);
        if (start < 0) {
            throw new IOException("Redirect link has no 'url=' parameter: " + redirectHref);
        }
        start += startMarker.length();
        // The value normally ends where the "&query=" parameter begins; otherwise take the rest.
        int end = redirectHref.indexOf("&query=", start);
        if (end < 0) {
            end = redirectHref.length();
        }
        try {
            // Decode every escape sequence, not just %2F and %3A.
            return java.net.URLDecoder.decode(redirectHref.substring(start, end), "UTF-8");
        } catch (IllegalArgumentException e) {
            throw new IOException("Malformed percent-encoding in redirect link: " + redirectHref, e);
        }
    }

    /**
     * Parses the tag-stripped text of a book's info block into a field map.
     *
     * <p>All ASCII spaces are removed, the text is split into lines, and every line of the form
     * {@code label:value} is inspected. Only the label {@code "文本1"} is recognised; its value is
     * stored under the key {@code "page"}. The line is split at the first colon only, so a value
     * may itself contain colons. Lines without a colon, with an empty label, or with an empty
     * value are ignored.
     *
     * @param oldStr raw info text, one {@code label:value} pair per line; may be {@code null}
     * @return a new mutable map with the recognised fields; empty when nothing was recognised
     */
    public Map<String,String> analyseBookInfo(String oldStr){
        Map<String,String> resultMap = new HashMap<>();
        if(oldStr == null || oldStr.isEmpty()){
            return resultMap;
        }
        String compact = oldStr.replace(" ","");
        // Accept both LF and CRLF line endings.
        for(String line : compact.split("\\r?\\n")){
            int colon = line.indexOf(':');
            if(colon <= 0){
                continue; // no separator, or nothing in front of it
            }
            String label = line.substring(0, colon).trim();
            String value = line.substring(colon + 1).trim();
            if(value.isEmpty()){
                continue;
            }
            if("文本1".equals(label)){
                resultMap.put("page", value);
            }
        }
        return resultMap;
    }