URLConnection实现爬虫(解决重定向、设置cookie才能抓取页面等问题)

1.关键方法

    /**
     * Sends an HTTP POST request to the given URL and returns the response body.
     *
     * <p>Response lines are concatenated without line separators. Any exception is
     * printed (message to standard output, stack trace to standard error) and
     * whatever has been read so far, possibly the empty string, is returned.
     *
     * @param url
     *            target URL of the request
     * @param param
     *            request body, expected in the form
     *            {@code name1=value1&name2=value2}; when {@code null} an empty
     *            body is sent
     * @param encode
     *            name of the charset used to decode the response
     * @param cookie
     *            value of the Cookie request header; when {@code null} no Cookie
     *            header is sent
     * @return the response body with line breaks removed
     */
    public static String sendPost1(String url, String param, String encode,String cookie) {
        PrintWriter out = null;
        BufferedReader in = null;
        // A builder avoids the quadratic cost of repeated String concatenation.
        StringBuilder result = new StringBuilder();
        try {
            URL realUrl = new URL(url);
            // Open the connection to the URL.
            URLConnection conn = realUrl.openConnection();
            // Common request headers.
            conn.setRequestProperty("accept", "*/*");
            conn.setRequestProperty("Accept-Language","zh-CN,zh;q=0.8");
            conn.setRequestProperty("Cache-Control","max-age=0");
            conn.setRequestProperty("connection", "Keep-Alive");
            if (cookie != null) {
                conn.setRequestProperty("Cookie",cookie);
            }
            conn.setRequestProperty("user-agent",
                    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
            // Both flags are required to send a POST request and read its response.
            conn.setDoOutput(true);
            conn.setDoInput(true);
            // The output stream is always obtained so the request stays a POST,
            // even when there is no body to write.
            out = new PrintWriter(conn.getOutputStream());
            // Printing a null String would send the literal text "null".
            if (param != null) {
                out.print(param);
            }
            out.flush();
            // Read the response using the caller-supplied charset.
            in = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(),encode));
            String line;
            while ((line = in.readLine()) != null) {
                result.append(line);
            }
        } catch (Exception e) {
            System.out.println("发送 POST 请求出现异常!"+e);
            e.printStackTrace();
        }
        // Close the output and input streams.
        finally{
            try{
                if(out!=null){
                    out.close();
                }
                if(in!=null){
                    in.close();
                }
            }
            catch(IOException ex){
                ex.printStackTrace();
            }
        }
        return result.toString();
    }
    /**
     * Collects the cookies issued while requesting {@code url}, following up to
     * two HTTP redirects by hand.
     *
     * <p>Automatic redirect handling is switched off so that the
     * {@code Set-Cookie} headers of the redirecting responses can be read. The
     * cookies gathered so far are sent along with each follow-up request. If the
     * connection cannot be set up, an empty string is returned; if a response
     * carries no usable {@code Location} header, the cookies gathered up to that
     * point are returned.
     *
     * @param url
     *            URL to request
     * @return cookies in the form {@code key=value;key=value;...}
     */
    public static String getCookie2(String url) {
        HttpURLConnection conn = null;
        try {
            URL realUrl = new URL(url);
            conn = (HttpURLConnection) realUrl.openConnection();
            conn.setRequestProperty("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
            conn.setRequestProperty("Accept-Encoding","gzip, deflate, sdch");
            conn.setRequestProperty("Accept-Language","zh-CN,zh;q=0.8");
            conn.setRequestProperty("Cache-Control","max-age=0");
            conn.setRequestProperty("connection", "Keep-Alive");
            conn.setRequestProperty("user-agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
            // Redirects are followed automatically by default; disable that so
            // the Location and Set-Cookie headers of this response stay visible.
            conn.setInstanceFollowRedirects(false);

            conn.setDoInput(true);
            conn.setDoOutput(true);
            conn.setRequestMethod("POST");
        } catch (Exception e) {
            e.printStackTrace();
        }
        // Without a connection there is nothing to read headers from.
        if (conn == null) {
            return "";
        }

        String sessionId = "";
        String location;
        try {
            String key;
            // Collect the name=value part of every Set-Cookie header.
            for (int i = 1; (key = conn.getHeaderFieldKey(i)) != null; i++) {
                if (key.equalsIgnoreCase("set-cookie")) {
                    String cookieVal = conn.getHeaderField(i);
                    // A cookie without attributes has no ';' to cut at.
                    int end = cookieVal.indexOf(';');
                    if (end >= 0) {
                        cookieVal = cookieVal.substring(0, end);
                    }
                    sessionId = sessionId + cookieVal + ";";
                }
            }
            // Redirect target of this response, null when it is not a redirect.
            location = conn.getHeaderField("Location");
        } finally {
            conn.disconnect();
        }

        // No redirect (or an unusable target): the cookies read so far are all there is.
        String firstTarget = resolveRedirectTarget(url, location);
        if (firstTarget == null) {
            return sessionId;
        }
        List<String> firstHop = getCookie3(firstTarget, sessionId);

        // Only one redirect: return everything gathered up to here.
        String secondTarget = resolveRedirectTarget(firstTarget, firstHop.get(1));
        if (secondTarget == null) {
            return sessionId + firstHop.get(0);
        }
        List<String> secondHop = getCookie3(secondTarget, sessionId + firstHop.get(0));

        // NOTE(review): as in the original code, the first hop's cookies are sent
        // to the second hop but are not part of the returned value - confirm this
        // is intended.
        return sessionId + secondHop.get(0);
    }

    /**
     * Resolves a Location header value against the URL that produced it, so that
     * relative redirect targets become absolute.
     *
     * @param baseUrl
     *            URL of the request whose response carried the header
     * @param location
     *            Location header value, may be {@code null}
     * @return the absolute target, or {@code null} when there is no location or
     *         it cannot be parsed
     */
    private static String resolveRedirectTarget(String baseUrl, String location) {
        if (location == null) {
            return null;
        }
        try {
            return new URL(new URL(baseUrl), location).toString();
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }
    /**
     * Requests {@code url} with the given cookies, without following redirects,
     * and reports the cookies and redirect target of the response.
     *
     * @param url
     *            URL to request
     * @param cookie
     *            value of the Cookie request header; when {@code null} no Cookie
     *            header is sent
     * @return a two-element list: index 0 holds the cookies of this response in
     *         the form {@code key=value;key=value;...} (empty string when there
     *         are none), index 1 holds the value of the {@code Location} header,
     *         or {@code null} when the response has none or the connection could
     *         not be set up
     */
    public static List<String> getCookie3(String url,String cookie) {
        HttpURLConnection conn = null;
        try {
            URL realUrl = new URL(url);
            conn = (HttpURLConnection) realUrl.openConnection();
            conn.setRequestProperty("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
            conn.setRequestProperty("Accept-Encoding","gzip, deflate, sdch");
            conn.setRequestProperty("Accept-Language","zh-CN,zh;q=0.8");
            conn.setRequestProperty("Cache-Control","max-age=0");
            conn.setRequestProperty("connection", "Keep-Alive");
            conn.setRequestProperty("user-agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
            if (cookie != null) {
                conn.setRequestProperty("Cookie",cookie);
            }

            // Keep redirect responses visible instead of following them.
            conn.setInstanceFollowRedirects(false);
            conn.setDoInput(true);
            conn.setDoOutput(true);
            conn.setRequestMethod("POST");
        } catch (Exception e) {
            e.printStackTrace();
        }

        String sessionId = "";
        String location = null;
        // Headers can only be read when the connection was set up.
        if (conn != null) {
            try {
                location = conn.getHeaderField("Location");
                String key;
                // Collect the name=value part of every Set-Cookie header.
                for (int i = 1; (key = conn.getHeaderFieldKey(i)) != null; i++) {
                    if (key.equalsIgnoreCase("set-cookie")) {
                        String cookieVal = conn.getHeaderField(i);
                        // A cookie without attributes has no ';' to cut at.
                        int end = cookieVal.indexOf(';');
                        if (end >= 0) {
                            cookieVal = cookieVal.substring(0, end);
                        }
                        sessionId = sessionId + cookieVal + ";";
                    }
                }
            } finally {
                conn.disconnect();
            }
        }

        List<String> list = new ArrayList<String>();
        list.add(sessionId); // cookies
        list.add(location);  // redirect target
        return list;
    }

另附,最基本的get抓取、post抓取、获取cookie方法

public class HttpURLContent {
    /**
     * Sends an HTTP GET request to the given URL and returns the response body.
     *
     * <p>Response lines are concatenated without line separators. Any exception is
     * printed (message to standard output, stack trace to standard error) and
     * whatever has been read so far, possibly the empty string, is returned.
     *
     * @param url
     *            target URL of the request, without a query string
     * @param param
     *            query string, expected in the form
     *            {@code name1=value1&name2=value2}; when {@code null} or empty
     *            the URL is requested as given
     * @return the response body with line breaks removed
     */
    public static String sendGet(String url, String param) {
        // A builder avoids the quadratic cost of repeated String concatenation.
        StringBuilder result = new StringBuilder();
        BufferedReader in = null;
        try {
            // Concatenating a null param would request "url?null".
            String urlNameString = url;
            if (param != null && param.length() > 0) {
                urlNameString = url + "?" + param;
            }
            URL realUrl = new URL(urlNameString);
            // Open the connection to the URL.
            URLConnection connection = realUrl.openConnection();
            // Common request headers.
            connection.setRequestProperty("accept", "*/*");
            connection.setRequestProperty("connection", "Keep-Alive");
            connection.setRequestProperty("user-agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
            // Establish the actual connection.
            connection.connect();
            // NOTE(review): no charset is passed here, so the response is decoded
            // with the platform default charset.
            in = new BufferedReader(new InputStreamReader(
                    connection.getInputStream()));
            String line;
            while ((line = in.readLine()) != null) {
                result.append(line);
            }
        } catch (Exception e) {
            System.out.println("发送GET请求出现异常!" + e);
            e.printStackTrace();
        }
        // Close the input stream.
        finally {
            try {
                if (in != null) {
                    in.close();
                }
            } catch (Exception e2) {
                e2.printStackTrace();
            }
        }
        return result.toString();
    }

    /**
     * Sends an HTTP POST request to the given URL and returns the response body.
     *
     * <p>Response lines are concatenated without line separators. Any exception is
     * printed (message to standard output, stack trace to standard error) and
     * whatever has been read so far, possibly the empty string, is returned.
     *
     * @param url
     *            target URL of the request
     * @param param
     *            request body, expected in the form
     *            {@code name1=value1&name2=value2}; when {@code null} an empty
     *            body is sent
     * @return the response body with line breaks removed
     */
    public static String sendPost(String url, String param) {
        PrintWriter out = null;
        BufferedReader in = null;
        // A builder avoids the quadratic cost of repeated String concatenation.
        StringBuilder result = new StringBuilder();
        try {
            URL realUrl = new URL(url);
            // Open the connection to the URL.
            URLConnection conn = realUrl.openConnection();
            // Common request headers.
            conn.setRequestProperty("accept", "*/*");
            conn.setRequestProperty("connection", "Keep-Alive");
            conn.setRequestProperty("user-agent",
                    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
            // Both flags are required to send a POST request and read its response.
            conn.setDoOutput(true);
            conn.setDoInput(true);
            // The output stream is always obtained so the request stays a POST,
            // even when there is no body to write.
            out = new PrintWriter(conn.getOutputStream());
            // Printing a null String would send the literal text "null".
            if (param != null) {
                out.print(param);
            }
            out.flush();
            // NOTE(review): no charset is passed here, so the response is decoded
            // with the platform default charset.
            in = new BufferedReader(
                    new InputStreamReader(conn.getInputStream()));
            String line;
            while ((line = in.readLine()) != null) {
                result.append(line);
            }
        } catch (Exception e) {
            System.out.println("发送 POST 请求出现异常!"+e);
            e.printStackTrace();
        }
        // Close the output and input streams.
        finally{
            try{
                if(out!=null){
                    out.close();
                }
                if(in!=null){
                    in.close();
                }
            }
            catch(IOException ex){
                ex.printStackTrace();
            }
        }
        return result.toString();
    }
    /**
     * Requests {@code url} with a POST and returns the cookies set by the
     * response.
     *
     * <p>If the connection cannot be set up, the exception is printed and an
     * empty string is returned.
     *
     * @param url
     *            URL to request
     * @return cookies in the form {@code key=value;key=value;...}, or an empty
     *         string when the response sets none
     */
    public static String getCookie(String url) {
        HttpURLConnection conn = null;
        try {
            URL realUrl = new URL(url);
            conn = (HttpURLConnection) realUrl.openConnection();
            conn.setDoInput(true);
            conn.setDoOutput(true);
            conn.setRequestMethod("POST");
        } catch (Exception e) {
            e.printStackTrace();
        }
        // Without a connection there is nothing to read headers from.
        if (conn == null) {
            return "";
        }

        String sessionId = "";
        try {
            String key;
            // Collect the name=value part of every Set-Cookie header.
            for (int i = 1; (key = conn.getHeaderFieldKey(i)) != null; i++) {
                if (key.equalsIgnoreCase("set-cookie")) {
                    String cookieVal = conn.getHeaderField(i);
                    // A cookie without attributes has no ';' to cut at.
                    int end = cookieVal.indexOf(';');
                    if (end >= 0) {
                        cookieVal = cookieVal.substring(0, end);
                    }
                    sessionId = sessionId + cookieVal + ";";
                }
            }
        } finally {
            conn.disconnect();
        }
        return sessionId;
    }

2.问题总结
第一步:使用最基本方法,直接抓取,抓取到内容,恭喜你。

第二步:直接抓取页面无果时,通过设置cookie抓取,即conn.setRequestProperty("Cookie",cookie);

第三步:新的问题是,如何获取cookie,当第一次访问页面时会产生cookie。所以要先访问一次页面,拿到cookie。即getCookie(String url)方法

第四步:这里就比较复杂了。我接触的大部分页面抓取中,目标页面不存在重定向;如果遇到重定向,就需要使用getCookie2()和getCookie3()方法获取cookie。

这也是我目前遇到的最麻烦的抓取,用了两天才解决。加油加油加油!!!

3.测试代码

    /**
     * Export tax rebate rate query: fetches the result page for {@code url} and
     * extracts the page count and the result rows from its HTML.
     *
     * <p>Example url:
     * http://www.zjtax.gov.cn/wcm/xchaxun/tuishui.jsp?sotype=FULLNAME&sovalue=钢铁&PageIndex=1
     *
     * @param url
     *            URL of the query page to fetch
     * @return a map holding {@code "totalPage_str"} (page count text) and
     *         {@code "result"} (list of row maps) when the page contains
     *         results; an empty map otherwise
     */
    public HashMap<String,Object> getCktsls(String url){
        final String pageMarker = "<font color='#104194'>共";
        // Map keys for the first five cells after the row number, in page order.
        final String[] fieldKeys = {"nsrmc", "type", "sdate", "edate", "sign"};

        // A cookie has to be obtained before the page can be fetched.
        String cookie = HttpURLContent.getCookie2("http://www.zjtax.gov.cn/wcm/xchaxun/tuishui.jsp");
        HashMap<String,Object> re = new HashMap<String,Object>();
        String html = HttpURLContent.sendPost1(url, null, "utf-8", cookie);

        // No marker means the query produced no results.
        if (!html.contains(pageMarker)) {
            return re;
        }

        // The total page count is the text between the marker and "页".
        String afterMarker = html.split(pageMarker)[1];
        String totalPages = afterMarker.substring(0, afterMarker.indexOf("页")).trim();

        List<Map<String,String>> rows = new ArrayList<Map<String,String>>();
        // Every chunk after the first starts with the row number of one result row.
        String[] rowChunks = html.split("class=\"gs_cx4_sp7\">");
        for (int r = 1; r < rowChunks.length; r++) {
            String chunk = rowChunks[r];
            Map<String,String> row = new HashMap<String,String>();
            row.put("number", chunk.substring(0, chunk.indexOf("</span>")));

            // Only the first five cells following the row number are kept.
            String[] cells = chunk.split("\">");
            int lastCell = Math.min(cells.length - 1, fieldKeys.length);
            for (int c = 1; c <= lastCell; c++) {
                String cell = cells[c];
                row.put(fieldKeys[c - 1], cell.substring(0, cell.indexOf("</span>")));
            }
            rows.add(row);
        }

        re.put("totalPage_str", totalPages);
        re.put("result", rows);
        return re;
    }
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值