根据HTTP和HTML中的字符集下载网页

参考文章:

http://stulance.iteye.com/blog/1740524

http://blog.csdn.net/it_magician/article/details/9240727

http://hc.apache.org/httpcomponents-client-4.5.x/tutorial/html/fundamentals.html#d5e199


    /**
     * Downloads the resource at {@code url} as text and saves it to the file {@code filename}.
     *
     * <p>The body is fetched with {@code client.execute} using the {@code rh} response handler,
     * written to {@code filename} via {@code BCFileUtils.writeFile}, and then passed to
     * {@code BCFileUtils.addBOMHead}. Missing parent directories are created first.
     *
     * @param url      URL to download
     * @param filename full path of the file to write
     * @param client   HTTP client used for the request; when {@code null} no request is made
     * @param context  HTTP context handed to {@code client.execute}
     * @return the downloaded text; an empty string when {@code client} is {@code null};
     *         {@code null} when the request or the file write fails
     */
    public static String downloadFile(String url, String filename,
                                      CloseableHttpClient client, HttpClientContext context) {
        if (client == null) {
            return "";
        }

        String content;
        try {
            HttpGet httpGet = new HttpGet(url);
            try {
                content = client.execute(httpGet, rh, context);
            } catch (HttpResponseException hrex) {
                // Thrown by rh for status codes >= 300; log it like the other failure paths.
                CrawlLogger.CrawlInfo("[ERROR]url=" + url + " HTTP status " + hrex.getStatusCode());
                return null;
            } catch (ClientProtocolException cpex) {
                CrawlLogger.CrawlInfo("[ERROR]url=" + url + " protocol error: " + cpex.getMessage());
                return null;
            }

            if (content == null) {
                return null;
            }

            File file = new File(filename);
            // getParentFile() is null when filename has no directory component;
            // in that case there is nothing to create.
            File parentDir = file.getParentFile();
            if (parentDir != null) {
                parentDir.mkdirs();
            }
            BCFileUtils.writeFile(content, filename);

            // Add a BOM header to the saved file.
            BCFileUtils.addBOMHead(file);
        } catch (HttpHostConnectException exHHCE) {
            CrawlLogger.CrawlInfo("[ERROR]url=" + url + " connection failed");
            return null;
        } catch (UnknownHostException exUHE) {
            CrawlLogger.CrawlInfo("[ERROR]url=" + url + " domain name parsing failed");
            return null;
        } catch (Exception e) {
            CrawlLogger.ErrInfo(BCWebUtils.class, e);
            return null;
        }
        return content;
    }

    // Response handler that buffers the whole response body and decodes it to a String.
    // Charset resolution order: the charset from the entity's content type, then a charset
    // sniffed from <meta> tags in the body (getCharset), then the platform default charset.
    private static ResponseHandler<String> rh = new ResponseHandler<String>() {
        /**
         * Reads the response body and returns it as decoded text.
         *
         * @param response the HTTP response to read
         * @return the decoded body text
         * @throws HttpResponseException   when the status code is 300 or higher
         * @throws ClientProtocolException when the response carries no entity
         * @throws IOException             when reading the body fails
         */
        @Override
        public String handleResponse(
                final HttpResponse response) throws IOException {
            StatusLine statusLine = response.getStatusLine();
            HttpEntity entity = response.getEntity();
            if (statusLine.getStatusCode() >= 300) {
                throw new HttpResponseException(
                        statusLine.getStatusCode(),
                        statusLine.getReasonPhrase());
            }
            if (entity == null) {
                throw new ClientProtocolException("Response contains no content");
            }

            // Buffer the raw bytes once so they can be decoded twice (sniffing + final decode).
            // writeTo() obtains the content stream a single time, instead of calling
            // getContent() on every loop iteration as the previous hand-written loop did.
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            entity.writeTo(baos);
            byte[] body = baos.toByteArray();

            // NOTE(review): getOrDefault() presumably supplies a default content type when the
            // response has no Content-Type header, which would skip the sniffing below - confirm.
            ContentType contentType = ContentType.getOrDefault(entity);
            Charset charset = contentType.getCharset();
            if (charset == null) {
                try {
                    // Decode provisionally as UTF-8 just to look for a <meta> charset declaration.
                    String provisional = new String(body, Charset.forName("UTF-8"));
                    String charsetFromContent = getCharset(provisional);
                    // getCharset() returns null when nothing is declared; Charset.forName(null)
                    // would throw, so only resolve a name that was actually found.
                    if (charsetFromContent != null) {
                        charset = Charset.forName(charsetFromContent);
                    }
                } catch (Exception ex) {
                    CrawlLogger.ErrInfo(BCWebUtils.class, ex);
                }
                if (charset == null) {
                    charset = Charset.defaultCharset();
                }
            }

            return new String(body, charset);
        }
    };

    /**
     * Looks up the charset name declared in an HTML document.
     *
     * <p>The result of {@code getCharsetFromContent} is used when it is non-null; otherwise the
     * result of {@code getCharsetFromMeta} is returned.
     *
     * @param content HTML text to inspect
     * @return the declared charset name, or {@code null} when neither lookup finds one
     * @throws Exception propagated from the two lookup methods
     */
    public static String getCharset(String content) throws Exception {
        String fromHttpEquiv = getCharsetFromContent(content);
        return fromHttpEquiv != null ? fromHttpEquiv : getCharsetFromMeta(content);
    }

    // Compiled once. Matches <meta http-equiv="content-type" content="text/html; charset=NAME">.
    // NAME must start with a letter or digit and may then contain - _ . : +, so names such as
    // ISO8859_1 are captured whole and an empty name never reaches Charset.isSupported().
    private static final Pattern HTTP_EQUIV_CHARSET_PATTERN = Pattern.compile(
            "<meta\\s*http-equiv\\s*=\\s*[\"']?content-type[\"']?\\s*content\\s*=\\s*[\"']"
                    + "text/html\\s*;\\s*charset\\s*=\\s*([a-z\\d][a-z\\d\\-_.:+]*)",
            Pattern.CASE_INSENSITIVE);

    /**
     * Extracts the charset declared by an
     * {@code <meta http-equiv="content-type" content="text/html; charset=...">} tag.
     *
     * <p>Only the first matching tag is considered. Matching is case-insensitive.
     *
     * @param content HTML text to inspect; may be {@code null}
     * @return the charset name exactly as written in the document when this JVM supports it;
     *         {@code null} when {@code content} is null, no such tag is found, or the named
     *         charset is not supported
     * @throws IOException never thrown by this implementation; kept for caller compatibility
     */
    public static String getCharsetFromContent(String content) throws IOException {
        if (content == null) {
            return null;
        }
        Matcher matcher = HTTP_EQUIV_CHARSET_PATTERN.matcher(content);
        if (matcher.find()) {
            String charset = matcher.group(1);
            if (Charset.isSupported(charset)) {
                return charset;
            }
        }

        return null;
    }

    // Compiled once. Matches a charset attribute inside a <meta ...> tag, e.g.
    // <meta charset="utf-8"> or <meta charset=utf-8>. The previous pattern required a quote
    // BEFORE "charset=", so the standard <meta charset="..."> form never matched.
    private static final Pattern META_CHARSET_PATTERN = Pattern.compile(
            "<meta\\b[^>]*?\\bcharset\\s*=\\s*[\"']?([a-z\\d][a-z\\d\\-_.:+]*)",
            Pattern.CASE_INSENSITIVE);

    /**
     * Extracts the charset declared by a {@code charset=} attribute or value inside a
     * {@code <meta>} tag, such as {@code <meta charset="utf-8">}.
     *
     * <p>Only the first matching tag is considered. Matching is case-insensitive.
     *
     * @param content HTML text to inspect; may be {@code null}
     * @return the charset name exactly as written in the document when this JVM supports it;
     *         {@code null} when {@code content} is null, no declaration is found, or the named
     *         charset is not supported
     * @throws Exception never thrown by this implementation; kept for caller compatibility
     */
    public static String getCharsetFromMeta(String content) throws Exception {
        if (content == null) {
            return null;
        }
        Matcher matcher = META_CHARSET_PATTERN.matcher(content);
        if (matcher.find()) {
            String charset = matcher.group(1);
            if (Charset.isSupported(charset)) {
                return charset;
            }
        }

        return null;
    }


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值