在编写爬虫程序的时候,为了减轻数据库的压力,通常会使用缓存技术,对抓取到的链接或者网页源码进行缓存;缓存网页源码时,通常需要先对其进行压缩,读取时再解压。
下面是用Java实现对网页源码的压缩和解压的方法:
public class GzipUtil {

    /**
     * Compresses a string with GZIP.
     *
     * @param data    the text to compress; {@code null} or empty yields {@code null}
     * @param charset name of the charset used to encode {@code data} into bytes
     * @return the GZIP-compressed bytes, or {@code null} when the input is
     *         null/empty or when compression fails (the error is printed,
     *         preserving this utility's original best-effort contract)
     */
    public static byte[] gZip(String data, String charset) {
        if (data == null || data.isEmpty()) {
            return null;
        }
        // try-with-resources guarantees both streams are closed even if
        // write()/finish() throws (the original leaked them on failure).
        try (ByteArrayOutputStream bos = new ByteArrayOutputStream();
             GZIPOutputStream gzip = new GZIPOutputStream(bos)) {
            gzip.write(data.getBytes(charset));
            // finish() flushes all remaining compressed data into bos,
            // so toByteArray() below sees the complete GZIP stream.
            gzip.finish();
            return bos.toByteArray();
        } catch (Exception ex) {
            // Broad catch kept deliberately: callers rely on null-on-failure,
            // and the file's imports are not visible here to narrow safely.
            ex.printStackTrace();
            return null;
        }
    }

    /**
     * Decompresses GZIP bytes back into a string.
     *
     * @param data    GZIP-compressed bytes; {@code null} yields the empty string
     * @param charset name of the charset used to decode the decompressed bytes
     * @return the decompressed text, or {@code ""} when the input is null or
     *         decompression fails (the error is printed)
     */
    public static String unGZip(byte[] data, String charset) {
        if (data == null) {
            return "";
        }
        try (GZIPInputStream gzip = new GZIPInputStream(new ByteArrayInputStream(data));
             ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
            byte[] buf = new byte[1024];
            int num;
            while ((num = gzip.read(buf, 0, buf.length)) != -1) {
                baos.write(buf, 0, num);
            }
            // ByteArrayOutputStream.toString(String) decodes the accumulated
            // bytes directly; no separate toByteArray()/flush() dance needed.
            return baos.toString(charset);
        } catch (Exception ex) {
            ex.printStackTrace();
            return "";
        }
    }

    /**
     * Demo entry point: fetches a page, compresses it, then decompresses it
     * to show the round trip.
     */
    public static void main(String[] args) throws Exception {
        String str = SendGet.sendGet("http://blog.csdn.net/", "", "utf-8");
        // NOTE(review): the page is fetched as utf-8 but re-encoded as gbk for
        // the round trip; characters unmappable in GBK would be lost — confirm
        // this is intentional for the target pages.
        String charset = "gbk";
        byte[] src = str.getBytes(charset);
        System.out.println("压缩前:" + str);
        System.out.println("大小 : " + src.length);
        byte[] params = gZip(str, charset);
        // gZip returns null on empty input or failure; guard before dereferencing.
        if (params == null) {
            System.out.println("压缩失败");
            return;
        }
        // Print the compressed size; concatenating the raw array would only
        // print its identity hash (e.g. "[B@1b6d3586"), which is meaningless.
        System.out.println("压缩后大小:" + params.length);
        String after = unGZip(params, charset);
        System.out.println("解压后:" + after);
    }
}