htmlunit(二)

htmlunit默认会解析网页中的css和javascript。对于一般的非js加载页面的采集,我们可以把css、javascript解析关掉,这样可以提高效率;但是对于通过js加载内容的页面,就不能关掉javascript解析了;

import java.io.IOException;
import java.net.MalformedURLException;
 
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
 
/**
 * Fetches a page with HtmlUnit while CSS and JavaScript processing are switched off.
 */
public class HtmlUnitTest4 {

    /**
     * Loads http://www.baidu.com and prints the page first as XML and then as plain text.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        final WebClient client = new WebClient(BrowserVersion.FIREFOX_52);
        // Turn off CSS and JavaScript support before requesting the page.
        client.getOptions().setCssEnabled(false);
        client.getOptions().setJavaScriptEnabled(false);
        try {
            final HtmlPage html = client.getPage("http://www.baidu.com");
            final String xml = html.asXml();
            System.out.println("网页html:" + xml);
            System.out.println("====================");
            final String text = html.asText();
            System.out.println("网页文本:" + text);
        } catch (FailingHttpStatusCodeException | IOException e) {
            // MalformedURLException is a subclass of IOException, so it is handled here too.
            e.printStackTrace();
        } finally {
            // Always release the client, whether or not the request succeeded.
            client.close();
        }
    }
}

htmlunit有强大的DOM模拟操作功能,包括给input设置数据、模拟按钮点击等;
从页面源码中可以看到form的name和input的name,htmlunit可以通过name得到对应的dom元素,然后实现模拟操作;

import java.io.IOException;
import java.net.MalformedURLException;
 
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlForm;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlSubmitInput;
import com.gargoylesoftware.htmlunit.html.HtmlTextInput;
 
/**
 * Simulates filling in and submitting a search form with HtmlUnit.
 */
public class HtmlUnitTest5 {

    /**
     * Loads the blog index page, looks up the form named "myform", types "java" into the
     * input named "q", clicks the input named "submitButton" and prints the resulting page
     * as XML.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        final WebClient client = new WebClient(BrowserVersion.FIREFOX_52);
        try {
            final HtmlPage indexPage = client.getPage("http://blog.java1234.com/index.html");

            // Elements are located by their name attributes.
            final HtmlForm searchForm = indexPage.getFormByName("myform");
            final HtmlTextInput queryInput = searchForm.getInputByName("q");
            final HtmlSubmitInput submit = searchForm.getInputByName("submitButton");

            // Fill in the query, then trigger the submit button.
            queryInput.setValueAttribute("java");
            final HtmlPage resultPage = submit.click();

            System.out.println(resultPage.asXml());
        } catch (FailingHttpStatusCodeException | IOException e) {
            // MalformedURLException is a subclass of IOException, so it is handled here too.
            e.printStackTrace();
        } finally {
            // Always release the client, whether or not the request succeeded.
            client.close();
        }
    }
}

这里我们给出一个htmlunit实例,爬百度云;

为了更好的体现htmlunit的优势,我们先用httpclient爬下;

import java.io.IOException;
 
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
 
/**
 * Fetches a Baidu Pan share page with Apache HttpClient and prints the raw response body.
 */
public class BlogCrawler {

    /**
     * Sends a GET request with a Firefox User-Agent header and prints the response body
     * decoded as UTF-8.
     *
     * <p>Both the client and the response are managed by try-with-resources, so they are
     * closed even when reading the entity fails.
     *
     * @param args command-line arguments (unused)
     * @throws ClientProtocolException if the request fails with an HTTP protocol error
     * @throws IOException if the connection fails or the response body cannot be read
     */
    public static void main(String[] args) throws ClientProtocolException, IOException {
        HttpGet httpget = new HttpGet("https://pan.baidu.com/share/home?uk=305605848#category/type=0");
        // Present the request as coming from a desktop Firefox browser.
        httpget.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0");

        try (CloseableHttpClient httpclient = HttpClients.createDefault();
             CloseableHttpResponse response = httpclient.execute(httpget)) {
            HttpEntity entity = response.getEntity();
            System.out.println("网页内容:" + EntityUtils.toString(entity, "utf-8"));
        }
    }
}

运行后得到的内容里并没有我们想要的数据,原因是百度云的数据是通过ajax加载、再由js渲染到页面上的,所以用httpclient搞不定;

这时候,我们用htmlunit搞下:

import java.io.IOException;
import java.net.MalformedURLException;
 
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
 
/**
 * Fetches a JavaScript-rendered Baidu Pan share page with HtmlUnit.
 */
public class HtmlUnitTest7 {

    /**
     * Loads the share page, gives its background JavaScript up to 10 seconds to finish and
     * prints the resulting page as XML.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args){
        // try-with-resources closes the client (and frees its memory) on every exit path.
        try (WebClient webClient = new WebClient(BrowserVersion.FIREFOX_52)) {
            HtmlPage page = webClient.getPage("https://pan.baidu.com/share/home?uk=305605848#category/type=0");
            // Wait for background JavaScript instead of sleeping for a fixed time:
            // this returns as soon as the jobs are done, or after 10 seconds at the latest.
            webClient.waitForBackgroundJavaScript(10000);
            System.out.println(page.asXml());
        } catch (FailingHttpStatusCodeException | IOException e) {
            // MalformedURLException is a subclass of IOException, so it is handled here too.
            e.printStackTrace();
        }
    }
}

htmlunit提供了对table表格的操作支持:

这里先演示下操作简单表格:

<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>简单表格</title>
</head>
<body>
    <!-- Simple table with id "table1": one header row followed by one data row. -->
    <table id="table1">
        <!-- Header row: two th cells (student number, name). -->
        <tr>
            <th>学号</th>
            <th>姓名</th>
        </tr>
        <!-- Data row: two td cells. -->
        <tr>
            <td>007</td>
            <td>米歇尔</td>
        </tr>
    </table>
</body>
</html>

地址:http://www.java1234.com/crawler/table01.html
遍历所有行所有列代码:

import java.io.IOException;
import java.net.MalformedURLException;
 
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlTable;
import com.gargoylesoftware.htmlunit.html.HtmlTableCell;
import com.gargoylesoftware.htmlunit.html.HtmlTableRow;
 
/**
 * Walks every row and cell of a simple HTML table with HtmlUnit.
 */
public class HtmlUnitTest6 {

    /**
     * Loads the sample page, looks up the table with id "table1" and prints one line per
     * row, each cell's text followed by a single space.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        final WebClient client = new WebClient(BrowserVersion.FIREFOX_52);
        // Turn off CSS and JavaScript support before requesting the page.
        client.getOptions().setCssEnabled(false);
        client.getOptions().setJavaScriptEnabled(false);
        try {
            final HtmlPage tablePage = client.getPage("http://www.java1234.com/crawler/table01.html");
            final HtmlTable grid = tablePage.getHtmlElementById("table1");
            for (HtmlTableRow line : grid.getRows()) {
                printRow(line);
            }
        } catch (FailingHttpStatusCodeException | IOException e) {
            // MalformedURLException is a subclass of IOException, so it is handled here too.
            e.printStackTrace();
        } finally {
            // Always release the client, whether or not the request succeeded.
            client.close();
        }
    }

    /** Prints the text of each cell in the row followed by a space, then ends the line. */
    private static void printRow(HtmlTableRow line) {
        for (HtmlTableCell column : line.getCells()) {
            System.out.print(column.asText() + " ");
        }
        System.out.println();
    }
}

运行输出:

学号 姓名

007 米歇尔

当然也提供了直接获取指定行指定列的api:

import java.io.IOException;
import java.net.MalformedURLException;
 
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlTable;
import com.gargoylesoftware.htmlunit.html.HtmlTableCell;
import com.gargoylesoftware.htmlunit.html.HtmlTableRow;
 
/**
 * Reads individual cells of a simple HTML table by row and column index with HtmlUnit.
 */
public class HtmlUnitTest6 {

    /**
     * Loads the sample page, looks up the table with id "table1" and prints the text of the
     * cells at (row 0, column 1) and (row 1, column 0).
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        final WebClient client = new WebClient(BrowserVersion.FIREFOX_52);
        // Turn off CSS and JavaScript support before requesting the page.
        client.getOptions().setCssEnabled(false);
        client.getOptions().setJavaScriptEnabled(false);
        try {
            final HtmlPage tablePage = client.getPage("http://www.java1234.com/crawler/table01.html");
            final HtmlTable grid = tablePage.getHtmlElementById("table1");

            // Indices passed to getCellAt are zero-based: (0, 1) is first row, second column.
            final HtmlTableCell firstRowSecondColumn = grid.getCellAt(0, 1);
            System.out.println("获取第1行第2列:" + firstRowSecondColumn.asText());

            final HtmlTableCell secondRowFirstColumn = grid.getCellAt(1, 0);
            System.out.println("获取第2行第1列:" + secondRowFirstColumn.asText());
        } catch (FailingHttpStatusCodeException | IOException e) {
            // MalformedURLException is a subclass of IOException, so it is handled here too.
            e.printStackTrace();
        } finally {
            // Always release the client, whether or not the request succeeded.
            client.close();
        }
    }
}

运行输出:

获取第1行第2列:姓名

获取第2行第1列:007

有时候表格比较复杂,包含caption、header、body、footer几个部分;

htmlunit同样提供了api支持:

<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>复杂表格</title>
</head>
<body>
     <!-- Structured table with id "table1": caption, thead, tfoot and two tbody sections. -->
     <table id="table1">
        <caption>复杂表格</caption>
        <!-- Header section: one row with two th cells (count, name). -->
        <thead>
            <tr>
                <th>个数</th>
                <th>名称</th>
            </tr>
        </thead>
        <!-- Footer section, declared before the body sections; its second cell is empty. -->
        <tfoot>
            <tr>
                <td>7</td>
                <td></td>
            </tr>
        </tfoot>
        <!-- First body section: one data row. -->
        <tbody>
            <tr>
                <td>5</td>
                <td>猪</td>
            </tr>
        </tbody>
        <!-- Second body section: one data row. -->
        <tbody>
            <tr>
                <td>2</td>
                <td>牛</td>
            </tr>
        </tbody>
    </table>
</body>
</html>
表格的文本内容(按源码顺序):
标题:复杂表格
表头:个数 名称
表尾:7
内容:5 猪
内容:2 牛

这个表格比前面一个复杂点:

测试地址:http://www.java1234.com/crawler/table02.html

我们给下测试代码:

import java.io.IOException;
import java.net.MalformedURLException;
import java.util.List;
 
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlTable;
import com.gargoylesoftware.htmlunit.html.HtmlTableBody;
import com.gargoylesoftware.htmlunit.html.HtmlTableFooter;
import com.gargoylesoftware.htmlunit.html.HtmlTableHeader;
import com.gargoylesoftware.htmlunit.html.HtmlTableRow;
 
/**
 * Reads a structured HTML table (caption, header, bodies, footer) with HtmlUnit.
 */
public class HtmlUnitTest6 {

    /**
     * Loads the sample page, looks up the table with id "table1" and prints its caption,
     * header rows, body rows and footer rows to standard output.
     *
     * <p>The header and footer sections are skipped when the table does not have them,
     * instead of failing with a NullPointerException.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // try-with-resources closes the client (and frees its memory) on every exit path.
        try (WebClient webClient = new WebClient(BrowserVersion.FIREFOX_52)) {
            // Turn off CSS and JavaScript support before requesting the page.
            webClient.getOptions().setCssEnabled(false);
            webClient.getOptions().setJavaScriptEnabled(false);

            HtmlPage page = webClient.getPage("http://www.java1234.com/crawler/table02.html");
            HtmlTable table = page.getHtmlElementById("table1");

            String caption = table.getCaptionText();
            System.out.println("表格标题:" + caption);

            // getHeader() returns null when the table has no thead section.
            HtmlTableHeader header = table.getHeader();
            if (header != null) {
                List<HtmlTableRow> headerRows = header.getRows();
                System.out.println("头信息:");
                printRows(headerRows);
            }

            // A table may contain several tbody sections; print the rows of each in order.
            for (HtmlTableBody body : table.getBodies()) {
                printRows(body.getRows());
            }

            // getFooter() returns null when the table has no tfoot section.
            HtmlTableFooter footer = table.getFooter();
            if (footer != null) {
                List<HtmlTableRow> footerRows = footer.getRows();
                System.out.println("根信息:");
                printRows(footerRows);
            }
        } catch (FailingHttpStatusCodeException | IOException e) {
            // MalformedURLException is a subclass of IOException, so it is handled here too.
            e.printStackTrace();
        }
    }

    /** Prints the text of each given row on its own line. */
    private static void printRows(List<HtmlTableRow> rows) {
        for (HtmlTableRow row : rows) {
            System.out.println(row.asText());
        }
    }
}

运行输出:

表格标题:复杂表格

头信息:

个数 名称

5 猪

2 牛

根信息:

7

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值