Java 方式 HttpClient/HtmlUnit 爬虫入门案例

/**
 * Fetches the Baidu home page with Apache HttpClient and prints the response
 * body, decoded as UTF-8, to standard output.
 *
 * @throws Exception if the request fails or the body cannot be read
 */
@Test
public void test1() throws Exception {
    HttpGet httpGet = new HttpGet("http://www.baidu.com");

    // try-with-resources closes the response and then the client even when
    // execute() or the body read throws, so no connection is leaked.
    try (CloseableHttpClient httpClient = HttpClients.createDefault();
         CloseableHttpResponse response = httpClient.execute(httpGet)) {
        HttpEntity entity = response.getEntity();
        // A response is not required to carry a body; skip printing if absent.
        if (entity != null) {
            String result = EntityUtils.toString(entity, "UTF-8");
            System.out.println("网页内容:" + result);
        }
    }
}

/**
 * Loads the Baidu home page with HtmlUnit (JavaScript enabled, CSS disabled)
 * and prints the page's text representation to standard output.
 *
 * @throws Exception if the page cannot be loaded
 */
@Test
public void test2() throws Exception {
    // WebClient is AutoCloseable; closing it releases its windows and any
    // background JavaScript work even if getPage() throws.
    try (WebClient client = new WebClient()) {
        client.getOptions().setJavaScriptEnabled(true);
        client.getOptions().setUseInsecureSSL(true);
        client.getOptions().setCssEnabled(false);
        // Do not fail the test on script errors or non-2xx status codes.
        client.getOptions().setThrowExceptionOnScriptError(false);
        client.getOptions().setThrowExceptionOnFailingStatusCode(false);
        // NOTE(review): presumably resynchronizes AJAX calls so the page is
        // fully populated before it is read - confirm against HtmlUnit docs.
        client.setAjaxController(new NicelyResynchronizingAjaxController());

        HtmlPage page = (HtmlPage) client.getPage("http://www.baidu.com");
        System.out.print(page.asText());
    }
}
@Test
public void test3() throws  Exception{
//htmlunit不是专门做爬虫的,推荐用selenium+phantomjs无头浏览器去做,通过xpath的方式爬虫更加灵活
    final WebClient client = new WebClient();

    client.getOptions().setJavaScriptEnabled(true); S
    client.getOptions().setUseInsecureSSL(true);
    client.getOptions().setCssEnabled(false);
    client.getOptions().setThrowExceptionOnScriptError(false);
    client.getOptions().setThrowExceptionOnFailingStatusCode(false);

    HtmlPage page =(HtmlPage)  client.getPage("http://www.baidu.com");

    final HtmlForm form = page.getFormByName("f");

    final HtmlSubmitInput button = form.getInputByValue("百度一下");

    final HtmlTextInput textField = form.getInputByName("wd");

    textField.setValueAttribute("山炮");
  
    final HtmlPage nextPage = button.click();

    String result = nextPage.asText();

    System.out.print(result);



}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值