在编写爬虫程序的时候,为了减轻数据库的压力,通常会使用缓存技术,对抓取到的链接或者网页源码进行缓存;缓存网页源码时,通常需要先对其进行压缩,读取时再解压。
下面是用Java实现对网页源码的压缩和解压的方法:
public class GzipUtil {

    /**
     * Compresses a string with GZIP.
     *
     * @param data    the text to compress; {@code null} or empty yields {@code null}
     * @param charset name of the charset used to encode {@code data} into bytes
     * @return the GZIP-compressed bytes, or {@code null} when the input is
     *         null/empty or when compression fails (the error is printed,
     *         preserving this utility's original best-effort contract)
     */
    public static byte[] gZip(String data, String charset) {
        if (data == null || data.isEmpty()) {
            return null;
        }
        // try-with-resources guarantees both streams are closed even if
        // write()/finish() throws (the original leaked them on failure).
        try (ByteArrayOutputStream bos = new ByteArrayOutputStream();
             GZIPOutputStream gzip = new GZIPOutputStream(bos)) {
            gzip.write(data.getBytes(charset));
            // finish() flushes all remaining compressed data into bos,
            // so toByteArray() below sees the complete GZIP stream.
            gzip.finish();
            return bos.toByteArray();
        } catch (Exception ex) {
            // Broad catch kept deliberately: callers rely on null-on-failure,
            // and the file's imports are not visible here to narrow safely.
            ex.printStackTrace();
            return null;
        }
    }

    /**
     * Decompresses GZIP bytes back into a string.
     *
     * @param data    GZIP-compressed bytes; {@code null} yields the empty string
     * @param charset name of the charset used to decode the decompressed bytes
     * @return the decompressed text, or {@code ""} when the input is null or
     *         decompression fails (the error is printed)
     */
    public static String unGZip(byte[] data, String charset) {
        if (data == null) {
            return "";
        }
        try (GZIPInputStream gzip = new GZIPInputStream(new ByteArrayInputStream(data));
             ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
            byte[] buf = new byte[1024];
            int num;
            while ((num = gzip.read(buf, 0, buf.length)) != -1) {
                baos.write(buf, 0, num);
            }
            // ByteArrayOutputStream.toString(String) decodes the accumulated
            // bytes directly; no separate toByteArray()/flush() dance needed.
            return baos.toString(charset);
        } catch (Exception ex) {
            ex.printStackTrace();
            return "";
        }
    }

    /**
     * Demo entry point: fetches a page, compresses it, then decompresses it
     * to show the round trip.
     */
    public static void main(String[] args) throws Exception {
        String str = SendGet.sendGet("http://blog.csdn.net/", "", "utf-8");
        // NOTE(review): the page is fetched as utf-8 but re-encoded as gbk for
        // the round trip; characters unmappable in GBK would be lost — confirm
        // this is intentional for the target pages.
        String charset = "gbk";
        byte[] src = str.getBytes(charset);
        System.out.println("压缩前:" + str);
        System.out.println("大小 : " + src.length);
        byte[] params = gZip(str, charset);
        // gZip returns null on empty input or failure; guard before dereferencing.
        if (params == null) {
            System.out.println("压缩失败");
            return;
        }
        // Print the compressed size; concatenating the raw array would only
        // print its identity hash (e.g. "[B@1b6d3586"), which is meaningless.
        System.out.println("压缩后大小:" + params.length);
        String after = unGZip(params, charset);
        System.out.println("解压后:" + after);
    }
}