网络爬虫--HtmlUnit

1、简介
HtmlUnit 是一款开源的 Java 页面分析工具:读取页面后,可以有效地使用 HtmlUnit 分析页面上的内容。该项目可以模拟浏览器运行,被誉为 Java 浏览器的开源实现。它是一个没有界面的浏览器,运行速度快,通常与 JUnit 等测试框架配合使用。
2、官方API文档
HtmlUnit官网API文档
3、pom文件

<!-- HtmlUnit itself: provides the WebClient / HtmlPage classes used by the demo class. -->
<dependency>
            <groupId>net.sourceforge.htmlunit</groupId>
            <artifactId>htmlunit</artifactId>
            <version>2.26</version>
        </dependency>
        <!-- JUnit 4: supplies the @Test annotation that drives each demo method. -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>
        <!-- SLF4J binding for log4j 1.2: backs the org.slf4j.Logger used for output. -->
        <!-- NOTE(review): a log4j configuration is presumably required on the classpath; none is shown here. -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.21</version>
        </dependency>

4、测试代码

package com.sun.htmlunit;

import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.DomElement;
import com.gargoylesoftware.htmlunit.html.DomNodeList;
import com.gargoylesoftware.htmlunit.html.HtmlDivision;
import com.gargoylesoftware.htmlunit.html.HtmlForm;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlSubmitInput;
import com.gargoylesoftware.htmlunit.html.HtmlTextInput;

/**
 * Demonstrations of basic HtmlUnit usage, written as JUnit 4 tests.
 *
 * <p>Every test talks to a live web site, so the outcome depends on network
 * access and on the remote pages still having the structure the demo expects.
 * Failures are deliberately logged (with the full stack trace) instead of being
 * rethrown, which keeps each method a best-effort demonstration.
 *
 * <p>Created 2017-04-17 11:04:22.
 *
 * @author sunt
 * @version v1.0
 */
public class HtmlUnitTestDemo {

    /**
     * Logger shared by all demo methods.
     */
    private static final Logger logger = LoggerFactory.getLogger(HtmlUnitTestDemo.class);

    /**
     * Blog page fetched by several of the demos.
     */
    private static final String CSDN_BLOG_URL = "http://blog.csdn.net/u010427935";

    /**
     * Number of search submissions performed by {@link #test4()}.
     */
    private static final int SEARCH_ITERATIONS = 1000;

    /**
     * Upper bound, in milliseconds, that {@link #test7()} waits for background JavaScript.
     */
    private static final long AJAX_WAIT_MILLIS = 10000L;

    /**
     * Getting started: fetches a page with the default browser profile and logs
     * both its XML form and its text form.
     */
    @Test
    public void test1() {
        // try-with-resources closes the client (and frees its memory) on every path
        try (WebClient client = new WebClient()) {
            // Request and parse the page behind the URL
            HtmlPage page = client.getPage(CSDN_BLOG_URL);
            logger.info("===========>获取请求页面的html:{}", page.asXml());

            logger.info("=============>获取请求页面内容:{}", page.asText());
        } catch (Exception e) {
            // Pass the exception itself as the last argument so the stack trace is kept
            logger.error("=====HtmlUnitTestDemo===test1===>{}", e.getMessage(), e);
        }
    }

    /**
     * Simulates a specific browser, which works around sites that refuse
     * requests from the default client.
     */
    @Test
    public void test2() {
        try (WebClient client = new WebClient(BrowserVersion.FIREFOX_52)) {
            HtmlPage page = client.getPage("http://mvnrepository.com");
            // Serialising the whole DOM is expensive; skip it when DEBUG is off
            if (logger.isDebugEnabled()) {
                logger.debug("=====================>获取请求页面的html内容:{}", page.asXml());
            }
        } catch (Exception e) {
            logger.error("=======test2==========>{}", e.getMessage(), e);
        }
    }

    /**
     * Looks up specific elements: one {@code div} by id and all anchors by tag name.
     */
    @Test
    public void test3() {
        try (WebClient client = new WebClient(BrowserVersion.FIREFOX_52)) {
            HtmlPage page = client.getPage(CSDN_BLOG_URL);

            // Element with a given id; the lookup throws if the id is absent,
            // which ends up in the catch block below
            HtmlDivision division = page.getHtmlElementById("navMenu");
            if (logger.isDebugEnabled()) {
                logger.debug("================>指定html的内容:{}", division.asXml());
            }

            // All elements with a given tag name
            DomNodeList<DomElement> tagList = page.getElementsByTagName("a");
            if (logger.isDebugEnabled()) {
                for (DomElement domElement : tagList) {
                    logger.debug("===========>byTagName:{}", domElement.asXml());
                }
            }
        } catch (Exception e) {
            logger.error("=====test3===========>{}", e.getMessage(), e);
        }
    }

    /**
     * Simulates a click to drive a search form: fills the query field and
     * submits it {@link #SEARCH_ITERATIONS} times with a different keyword each time.
     */
    @Test
    public void test4() {
        try (WebClient client = new WebClient(BrowserVersion.FIREFOX_52)) {
            // Page that hosts the search form
            HtmlPage page = client.getPage("http://blog.java1234.com/index.html");
            // The form to submit
            HtmlForm form = page.getFormByName("myform");
            // The query text box
            HtmlTextInput input = form.getInputByName("q");
            // The submit button
            HtmlSubmitInput submitInput = form.getInputByName("submitButton");

            for (int i = 0; i < SEARCH_ITERATIONS; i++) {
                // Set the search keyword
                input.setValueAttribute("java" + i);
                // Simulate the click, which submits the form
                HtmlPage result = submitInput.click();
                logger.info("===========>搜索的结果:{}", result.asXml());
            }
        } catch (Exception e) {
            logger.error("===========test4========>{}", e.getMessage(), e);
        }
    }

    /**
     * Sends the request through an HTTP proxy.
     */
    @Test
    public void test5() {
        try (WebClient client = new WebClient(BrowserVersion.FIREFOX_52, "58.118.185.100", 8998)) {
            HtmlPage page = client.getPage(CSDN_BLOG_URL);
            if (logger.isDebugEnabled()) {
                logger.debug("=======html内容:====>{}", page.asXml());
            }
        } catch (Exception e) {
            logger.error("===========test5=========>{}", e.getMessage(), e);
        }
    }

    /**
     * Disables JavaScript and CSS processing for pages whose content does not
     * depend on JavaScript.
     */
    @Test
    public void test6() {
        try (WebClient client = new WebClient(BrowserVersion.FIREFOX_52)) {
            // Turn off CSS and JavaScript handling before the page is requested
            client.getOptions().setCssEnabled(false);
            client.getOptions().setJavaScriptEnabled(false);

            HtmlPage page = client.getPage(CSDN_BLOG_URL);
            if (logger.isDebugEnabled()) {
                logger.debug("==============>获取的html内容:{}", page.asXml());
            }
        } catch (Exception e) {
            logger.error("========error========test6====>{}", e.getMessage(), e);
        }
    }

    /**
     * Scrapes a page whose content is loaded via AJAX, which a plain HTTP
     * client cannot capture.
     */
    @Test
    public void test7() {
        try (WebClient client = new WebClient(BrowserVersion.FIREFOX_52)) {
            HtmlPage page = client.getPage("https://pan.baidu.com/share/home?uk=305605848#category/type=0");
            // Let HtmlUnit wait for its own background JavaScript jobs: this returns
            // as soon as they finish (or at the timeout) instead of always sleeping
            // the full interval, and avoids a swallowed InterruptedException
            client.waitForBackgroundJavaScript(AJAX_WAIT_MILLIS);

            logger.info("==========>抓去到的html内容:{}", page.asXml());
        } catch (Exception e) {
            logger.error("=====error===test7=====>{}", e.getMessage(), e);
        }
    }
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值