jsoup爬知乎并导出到excel

第一步 创建maven项目

第二步 在pom.xml中导入jsoup、lombok、POI(excel)和springboot的依赖

<!-- Maven POM fragment: libraries used by the crawler test class shown later in this article. -->
<dependencies>
    <!-- jsoup: fetches and parses the HTML pages (Jsoup.connect / select in the test class). -->
    <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.11.3</version>
    </dependency>
    <!-- Lombok: NOTE(review) no Lombok annotation appears in the test class shown in this article; confirm it is needed. -->
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <version>1.16.10</version>
    </dependency>
    <!-- Spring Boot test starter: no explicit version here; presumably resolved through the
         spring-boot-starter-parent declared at the end of this fragment (verify). -->
    <dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-test</artifactId>
    </dependency>
    <!-- Apache POI dependencies for Excel export -->
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi</artifactId>
        <version>3.10-FINAL</version>
    </dependency>
    <!-- poi-ooxml: provides the .xlsx (XSSFWorkbook) support used by writeExcel. -->
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-ooxml</artifactId>
        <version>3.10-FINAL</version>
    </dependency>
</dependencies>
    <!-- Parent POM declaration for the project (Spring Boot 2.0.3.RELEASE). -->
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.0.3.RELEASE</version>
    </parent>

 

第三步 在单元测试类中编写两个方法,代码如下:

/**
 * Crawls the Zhihu "explore/recommendations" page with jsoup and exports the
 * link, title, description and answer text of every recommended question to
 * an Excel (.xlsx) workbook.
 *
 * <p>All questions found in one run are collected first and written to the
 * workbook in a single pass, one row per question.
 */
public class TestReptilian {

    /** Path of the workbook produced by this class. */
    private static final String OUTPUT_PATH = "D:/crub/zhihu.xlsx";

    /** User-Agent header sent with every request. */
    private static final String USER_AGENT = "Mozilla";

    /** Maximum number of characters a single .xlsx cell may hold; POI rejects longer text. */
    private static final int MAX_CELL_TEXT_LENGTH = 32767;

    //#root > div > main > div > div > div > div.SignFlowHeader
    /**
     * Downloads the recommendation list, follows every question link, prints
     * the extracted fields and writes all of them to the Excel file.
     *
     * @throws IOException if a page cannot be downloaded
     * @throws IllegalStateException if the recommendation list container is
     *         missing from the downloaded page
     */
    @Test
    public void test() throws IOException {
        Document document = Jsoup.connect("https://www.zhihu.com/explore/recommendations")
                .userAgent(USER_AGENT)
                .get();
        Element main = document.getElementById("zh-recommend-list-full");
        if (main == null) {
            // getElementById returns null when the id is absent; fail with a clear message instead of an NPE.
            throw new IllegalStateException(
                    "Element #zh-recommend-list-full not found; the page layout may have changed");
        }
        Elements questionLinks = main.select("div").select("div:nth-child(2)")
                .select("h2").select("a[class=question_link]");

        // Collect one row per question so the workbook is written once with every result,
        // instead of being overwritten by each question in turn.
        java.util.List<String[]> rows = new java.util.ArrayList<String[]>();
        for (Element question : questionLinks) {
            // Absolute value of href, i.e. the link of each recommended question on the list page
            String questionUrl = question.attr("abs:href");
            // Download the page the question link points to
            Document questionPage = Jsoup.connect(questionUrl)
                    .userAgent(USER_AGENT)
                    .get();
            // Question title: #root > div > main > div > div:nth-child(11) > div.QuestionHeader > div.QuestionHeader-content > div.QuestionHeader-main > h1
            Elements title = questionPage.select("#root")
                    .select("div")
                    .select("main")
                    .select("div")
                    .select("div:nth-child(11)")
                    .select("div.QuestionHeader")
                    .select("div.QuestionHeader-content")
                    .select("div.QuestionHeader-main")
                    .select("h1");
            // Question description: #root > div > main > div > div:nth-child(11) > div.QuestionHeader > div.QuestionHeader-content > div.QuestionHeader-main > div:nth-child(3) > div > div > div > span
            Elements detail = questionPage.select("#root")
                    .select("div")
                    .select("main")
                    .select("div")
                    .select("div:nth-child(11)")
                    .select("div.QuestionHeader")
                    .select("div.QuestionHeader-content")
                    .select("div.QuestionHeader-main")
                    .select("div:nth-child(3)")
                    .select("div")
                    .select("div")
                    .select("div")
                    .select("span");
            // Answers
            Elements answer = questionPage.select("#root")
                    .select("div")
                    .select("main")
                    .select("div")
                    .select("div.Question-main")
                    .select("div.Question-mainColumn")
                    .select("div.Card.AnswerCard")
                    .select("div")
                    .select("div")
                    .select("div.RichContent.RichContent--unescapable")
                    .select("div.RichContent-inner")
                    .select("span");

            String titleText = title.text();
            String detailText = detail.text();
            String answerText = answer.text();
            System.out.println("\n" + "链接:" + questionUrl
                    + "\n" + "标题:" + titleText
                    + "\n" + "问题描述:" + detailText
                    + "\n" + "回答:" + answerText);
            rows.add(new String[] {questionUrl, titleText, detailText, answerText});
        }
        writeRows(rows);
    }

    /**
     * Writes a workbook that contains a single row with the given values.
     *
     * @param url    question link (column 0)
     * @param title  question title (column 1)
     * @param detail question description (column 2)
     * @param answer answer text (column 3)
     */
    public void writeExcel(String url, String title, String detail, String answer) {
        java.util.List<String[]> rows = new java.util.ArrayList<String[]>();
        rows.add(new String[] {url, title, detail, answer});
        writeRows(rows);
    }

    /**
     * Creates a fresh workbook, writes one styled row per entry of {@code rows}
     * and saves it to {@link #OUTPUT_PATH}, replacing any existing file.
     * An I/O failure while saving is reported on stderr and not rethrown.
     *
     * @param rows cell values, one array per row, one element per column
     */
    private void writeRows(java.util.List<String[]> rows) {
        Workbook workbook = new XSSFWorkbook();
        Sheet sheet = workbook.createSheet("0");
        CellStyle cellStyle = workbook.createCellStyle();
        // Fill colour, thin borders on all four sides, centred text
        cellStyle.setFillForegroundColor(HSSFColor.SKY_BLUE.index);
        cellStyle.setFillPattern(CellStyle.SOLID_FOREGROUND);
        cellStyle.setBorderBottom(CellStyle.BORDER_THIN);
        cellStyle.setBorderLeft(CellStyle.BORDER_THIN);
        cellStyle.setBorderRight(CellStyle.BORDER_THIN);
        cellStyle.setBorderTop(CellStyle.BORDER_THIN);
        cellStyle.setAlignment(CellStyle.ALIGN_CENTER);

        int rowIndex = 0;
        for (String[] values : rows) {
            Row row = sheet.createRow(rowIndex++);
            for (int column = 0; column < values.length; column++) {
                // Create each cell exactly once: a second createCell on the same column
                // replaces the cell and would discard the style set on the first one.
                row.createCell(column).setCellValue(fitToCell(values[column]));
                row.getCell(column).setCellStyle(cellStyle);
            }
        }

        workbook.setSheetName(0, "信息");

        File file = new File(OUTPUT_PATH);
        File directory = file.getParentFile();
        if (directory != null && !directory.exists()) {
            // If this fails, opening the stream below fails too and is reported there.
            directory.mkdirs();
        }
        // try-with-resources closes the stream even when workbook.write throws
        try (FileOutputStream out = new FileOutputStream(file)) {
            workbook.write(out);
        } catch (IOException e) {
            // Best-effort export, as before: report the failure without aborting the caller.
            e.printStackTrace();
        }
    }

    /**
     * Shortens {@code text} to the maximum length a cell can store.
     *
     * @param text cell text, may be {@code null}
     * @return {@code text} unchanged when it fits (or is {@code null}),
     *         otherwise its first {@link #MAX_CELL_TEXT_LENGTH} characters
     */
    private static String fitToCell(String text) {
        if (text == null || text.length() <= MAX_CELL_TEXT_LENGTH) {
            return text;
        }
        return text.substring(0, MAX_CELL_TEXT_LENGTH);
    }
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值