maven httpclient jsoup爬虫入门

import com.crawl.getdemo;
import com.crawl.getdemo2;


import java.io.IOException;

public class SimpleClient {
    public static void main(String[] args) throws IOException {
        getdemo2 getdemo2 = new getdemo2();
        getdemo2.testHttpClientA();
//        CloseableHttpClient httpClient = HttpClients.createDefault();
//        // 设置代理服务器地址和端口
//        //client.getHostConfiguration().setProxy("proxy_host_addr",proxy_port);
//        // 使用 GET 方法 ,如果服务器需要通过 HTTPS 连接,那只需要将下面 URL 中的 http 换成 https
//        HttpMethod method=new GetMethod("http://java.sun.com");
//        //使用POST方法
//        //HttpMethod method = new PostMethod("http://java.sun.com");
//        client.executeMethod(method);
//
//        //打印服务器返回的状态
//        System.out.println(method.getStatusLine());
//        //打印返回的信息
//        System.out.println(method.getResponseBodyAsString());
//        //释放连接
//        method.releaseConnection();
//    }
    }
}


getdemo类(入门)

获取页面信息

package com.crawl;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.ParseException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.zip.GZIPInputStream;

/**
 * Introductory HttpClient demo: fetches one page and dumps the response to standard output.
 */
public class getdemo {

    /**
     * Sends a GET request to a hard-coded URL and prints, in order, the HTTP status code,
     * every response header (name, value, and full header line), and the response body
     * decoded as GBK, line by line.
     *
     * <p>An {@link IOException} raised while executing the request, reading the body, or
     * closing the client/response is reported via {@code printStackTrace()} and not rethrown.
     *
     * @throws IOException declared so existing callers keep compiling; I/O failures inside
     *                     the method are caught and printed
     */
    public void testHttpClientA() throws IOException {
        // Page to fetch.
        String url = "http://www.biquge.com.tw/17_17380/";
        HttpGet httpGet = new HttpGet(url);

        // try-with-resources closes the response and then the client on every path,
        // including the one where execute() itself fails and no response exists.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // Status code of the response.
            System.out.println(response.getStatusLine().getStatusCode());

            // Dump all response headers.
            for (Header header : response.getAllHeaders()) {
                System.out.println(header.getName());
                System.out.println(header.getValue());
                System.out.println(header.toString());
            }

            // Read the body exactly once, as a stream. Calling EntityUtils.toString(entity, ...)
            // beforehand would consume the content and leave nothing for this loop to read.
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                try (BufferedReader reader =
                             new BufferedReader(new InputStreamReader(entity.getContent(), "GBK"))) {
                    String line;
                    while ((line = reader.readLine()) != null) {
                        System.out.println(line);
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}



getdemo2类

使用jsoup初步解析页面,提取所需文本并去除HTML标签

package com.crawl;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.ParseException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;

import static java.awt.SystemColor.info;

/**
 * HttpClient + jsoup demo: fetches one page and prints selected parts of it as plain text.
 */
public class getdemo2 {

    /**
     * Sends a GET request to a hard-coded URL, decodes the body as GBK, parses it with jsoup
     * and prints to standard output: the text of the first {@code <h1>}, the text of the
     * first element matching {@code #info p}, the text of the element with id {@code intro},
     * every element matching {@code #list dl dd a}, and finally the number of such elements.
     *
     * <p>If one of the single-valued elements is absent from the page, an empty string is
     * printed in its place. An {@link IOException} raised while executing the request,
     * reading the body, or closing the client/response is reported via
     * {@code printStackTrace()} and not rethrown.
     *
     * @throws IOException declared so existing callers keep compiling; I/O failures inside
     *                     the method are caught and printed
     */
    public void testHttpClientA() throws IOException {
        // Page to fetch.
        String url = "http://www.biquge.com.tw/17_17380/";
        HttpGet httpGet = new HttpGet(url);

        // try-with-resources closes the response and then the client on every path,
        // including the one where execute() itself fails and no response exists.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                // Nothing to parse; EntityUtils.toString rejects a null entity.
                System.err.println("No response body returned for " + url);
                return;
            }

            // The body is HTML; decode it as GBK and build the jsoup document.
            String content = EntityUtils.toString(entity, "GBK");
            Document doc = Jsoup.parse(content);

            // Title: first <h1> on the page.
            System.out.println("题目:" + firstText(doc.getElementsByTag("h1")));

            // Author line: first <p> under #info.
            System.out.println(firstText(doc.select("#info p")));

            // Introduction: element with id "intro" (getElementById yields null when absent).
            Element introElement = doc.getElementById("intro");
            System.out.println("简介" + (introElement == null ? "" : introElement.text()));

            // Chapter list: every link under #list, printed as its outer HTML, then the count.
            System.out.println("章节目录");
            Elements hrefElements = doc.select("#list dl dd a");
            for (Element link : hrefElements) {
                System.out.println(link.toString());
            }
            System.out.println(hrefElements.size());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Returns the text of the first element in {@code elements}, or "" when it is empty. */
    private static String firstText(Elements elements) {
        return elements.isEmpty() ? "" : elements.get(0).text();
    }
}




出现问题及解决

页面中文乱码问题

解决:该页面采用GBK编码,解码响应内容时应使用GBK而非utf-8。

关联代码

System.out.println(EntityUtils.toString(entity, "GBK"));
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(content,"GBK"));
    String content=EntityUtils.toString(entity, "GBK");


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值