企业岗位需求决策（一）：数据采集

最新推荐文章于 2024-10-07 10:33:49 发布

11.11.1

最新推荐文章于 2024-10-07 10:33:49 发布

阅读量117

点赞数 2

分类专栏：大数据分析实战　文章标签：数据分析

本文链接：https://blog.csdn.net/m0_55885128/article/details/140574254

版权

大数据分析实战专栏收录该内容

12 篇文章 0 订阅

订阅专栏

第1关：webclient+xpath

任务描述
本关任务：编写 WebClient 程序完成对页面数据的采集。

相关知识
为了完成本关任务，你需要掌握：1. 如何使用 WebClient；2. 如何对页面进行分析并获取 XPath 规则。

package net.educoder;

import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.List;

/**
 * Collects table-row text from a local HTML page using HtmlUnit and writes it to a file.
 *
 * <p>The page at {@code file:/data/workspace/myshixun/step1/1.html} is loaded with CSS and
 * JavaScript processing disabled. Every {@code <tr>} element whose {@code class} attribute is
 * exactly {@code even} or {@code odd} is selected, its text is trimmed, and the results are
 * written one per line (each terminated by {@code "\n"}) to the output file as UTF-8.
 */
public class App {
    /**
     * Runs the scrape and writes the collected text to the file named by {@code args[1]}.
     *
     * @param args command-line arguments; {@code args[1]} is the output file path
     *             ({@code args[0]} is not read by this program)
     * @throws IOException if the page cannot be loaded or the output file cannot be written
     * @throws IllegalArgumentException if fewer than two arguments are supplied
     */
    public static void main(String[] args) throws IOException {
        // Fail fast with a clear message instead of an ArrayIndexOutOfBoundsException
        // after the page has already been fetched.
        if (args.length < 2) {
            throw new IllegalArgumentException(
                    "expected the output file path as the second argument, got "
                            + args.length + " argument(s)");
        }

        // StringBuilder avoids re-copying the accumulated text on every row.
        StringBuilder content = new StringBuilder();

        // try-with-resources guarantees the WebClient is released even if loading fails.
        try (WebClient webClient = new WebClient()) {
            webClient.getOptions().setCssEnabled(false); // disable CSS support
            webClient.getOptions().setJavaScriptEnabled(false); // disable JavaScript support
            HtmlPage page = webClient.getPage("file:/data/workspace/myshixun/step1/1.html");

            /*
             * Supply the XPath rule here,
             *      e.g. page.getByXPath("//tr")
             */
            /*-------------------begin--------------------*/
            List<HtmlElement> byXPath = page.getByXPath("//tr[@class='even'] | //tr[@class='odd']");
            /*-------------------end--------------------*/

            /*
             * 1. Iterate over the byXPath list.
             * 2. Read each element's text via HtmlElement.asText() and trim() it so that
             *    leading/trailing whitespace does not pollute the data.
             * 3. Append the text to content, separating entries with "\n".
             */
            /*-------------------begin--------------------*/
            for (HtmlElement row : byXPath) {
                content.append(row.asText().trim()).append("\n");
            }
            /*-------------------end--------------------*/
        }

        File file = new File(args[1]);
        // try-with-resources closes (and flushes) the writer even when the write throws.
        try (OutputStreamWriter writer =
                     new OutputStreamWriter(new FileOutputStream(file), "utf-8")) {
            writer.append(content);
        }
    }
}