题目如下:
定义文件xx.tar.gz的产生方式如下:
•以xx为文件名的文件通过tar 和gzip打包压缩产生,该文件中以字符串的方式记录了一个非负整数;
•或者以xx为名的目录通过tar和gzip打包压缩产生,该目录中包含若干xx.tar.gz。
其中,x ∈ [0, 9],即 xx 为两位数字。
现给定一个根据上述定义生成的文件00.tar.gz,请确定其中包含的以xx为文件名的文件个数以及这些文件中所记录的非负整数之和。
解压tar.gz文件需要用到的包:
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
<version>1.8</version>
</dependency>
最初的解法:
package cn.com.learners;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.GZIPInputStream;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.junit.Test;
/**
 * First solution to the nested tar.gz puzzle: every archive is really unpacked to disk.
 *
 * <ol>
 * <li>Extract all entries of an archive and return the paths of the files written.</li>
 * <li>Walk that list: a plain file holds a number that is added to {@code sum}; a nested
 * archive increments {@code tarfilecount} and is processed recursively from step 1.</li>
 * </ol>
 *
 * @author Jason Li 2014-5
 */
public class TarGz {
    private final static String FileSeparator = "/";
    /** Size of the buffer used when copying an entry from the archive to disk. */
    private final static int BUFFER_SIZE = 4096;
    /** Running total of the integers read from the plain files. */
    private int sum = 0;
    /** Number of nested archives found; the archive passed to the first {@code count} call is not included. */
    private int tarfilecount = 0;

    /** Runs the computation on {@code v:/00.tar.gz} and prints the totals and the elapsed milliseconds. */
    @Test
    public void test() {
        long start = System.currentTimeMillis();
        try {
            count("v:", "00.tar.gz");
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.out.println("sum: " + sum);
        System.out.println("tar.gz file count:" + tarfilecount);
        long end = System.currentTimeMillis();
        System.out.println("time:" + (end - start));
    }

    /**
     * Extracts the given archive and recursively accumulates the totals of everything inside it.
     *
     * @param filePath directory that contains the archive; entries are extracted below it
     * @param fileName name of the archive inside {@code filePath}
     * @throws Exception if the archive cannot be read or a plain file does not contain an integer
     */
    public void count(String filePath, String fileName) throws Exception {
        List<String> filelist = extractTarGzFile(filePath, fileName);
        for (String s : filelist) {
            if (s.endsWith("tar.gz")) {
                tarfilecount++;
                int cut = s.lastIndexOf(FileSeparator);
                count(s.substring(0, cut), s.substring(cut + 1));
            } else {
                sum = sum + readIntFromFile(s);
            }
        }
    }

    /**
     * Reads the integer stored on the first line of a file.
     *
     * @param s path of the file to read
     * @return the parsed value; surrounding whitespace is ignored
     * @throws Exception if the file cannot be read, is empty, or does not hold an integer
     */
    private int readIntFromFile(String s) throws Exception {
        BufferedReader br = new BufferedReader(new FileReader(s));
        try {
            String line = br.readLine();
            if (line == null) {
                throw new NumberFormatException("No integer found in empty file: " + s);
            }
            return Integer.parseInt(line.trim());
        } finally {
            // Closing the reader also closes the underlying FileReader, even when parsing fails.
            br.close();
        }
    }

    /**
     * Unpacks a tar.gz archive below {@code filePath}.
     *
     * @param filePath directory that contains the archive and receives its entries
     * @param fileName name of the archive inside {@code filePath}
     * @return paths of all non-directory entries that were written
     * @throws Exception if the archive cannot be read or an entry cannot be written
     */
    private List<String> extractTarGzFile(String filePath, String fileName)
            throws Exception {
        List<String> filelist = new ArrayList<String>();
        InputStream is = new FileInputStream(filePath + FileSeparator + fileName);
        // Tracks the outermost stream opened so far so one close() releases the whole chain,
        // no matter at which step a failure happens.
        InputStream outermost = is;
        try {
            GZIPInputStream gis = new GZIPInputStream(new BufferedInputStream(is));
            outermost = gis;
            ArchiveInputStream ais = new ArchiveStreamFactory()
                    .createArchiveInputStream("tar", gis);
            outermost = ais;
            TarArchiveEntry entry;
            while ((entry = (TarArchiveEntry) ais.getNextEntry()) != null) {
                String name = resolveEntryPath(filePath, entry.getName());
                if (entry.isDirectory()) {
                    mkFolder(name);
                } else {
                    filelist.add(name);
                    writeEntry(ais, mkFile(name));
                }
            }
        } finally {
            outermost.close();
        }
        return filelist;
    }

    /**
     * Joins an entry name onto the extraction directory, dropping a trailing slash.
     *
     * @throws IOException if the entry name contains a {@code ..} segment, which could place
     *         the file outside of {@code filePath}
     */
    private String resolveEntryPath(String filePath, String entryName) throws IOException {
        String name = filePath;
        for (String segment : entryName.split("/")) {
            if ("..".equals(segment)) {
                throw new IOException("Refusing to extract entry outside of "
                        + filePath + ": " + entryName);
            }
            name = name + FileSeparator + segment;
        }
        return name;
    }

    /** Copies the content of the current archive entry into {@code target}. */
    private void writeEntry(ArchiveInputStream ais, File target) throws IOException {
        BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(target));
        try {
            byte[] buffer = new byte[BUFFER_SIZE];
            int bytesRead;
            while ((bytesRead = ais.read(buffer)) != -1) {
                out.write(buffer, 0, bytesRead);
            }
        } finally {
            // close() flushes the buffer before releasing the file handle.
            out.close();
        }
    }

    /** Creates the directory, including missing parents, unless it already exists. */
    private void mkFolder(String fileName) {
        File f = new File(fileName);
        if (!f.exists()) {
            f.mkdirs();
        }
    }

    /**
     * Creates an empty file, creating its parent directories first because an archive is not
     * required to list a directory before the files inside it.
     */
    private File mkFile(String fileName) throws IOException {
        File f = new File(fileName);
        File parent = f.getParentFile();
        if (parent != null && !parent.exists()) {
            parent.mkdirs();
        }
        f.createNewFile();
        return f;
    }
}
运行结果:
sum: 38454840
tar.gz file count:15442
time:36709
程序运行很慢,大部分时间都浪费在了IO上面,因为小文件非常多,这是运行完成后的文件及目录统计:
解法二,不再真的把文件解压到磁盘,而是直接把压缩文件中的内容作为流在内存中进行操作
package cn.com.learners;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.util.zip.GZIPInputStream;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.io.FileUtils;
import org.junit.Test;
/**
 * Second solution to the nested tar.gz puzzle: nothing is unpacked to disk any more, every
 * archive entry is read into a byte array and nested archives are processed recursively
 * from those bytes.
 *
 * @author Jason Li 2014-5
 */
public class TarGz_Recursion {
    /** Size of the buffer used when reading an entry from the archive stream. */
    private static final int BUFFER_SIZE = 4096;
    // Instance state (not static) so that a second instance or run starts from zero.
    /** Running total of the integers read from the plain files. */
    private int sum = 0;
    /** Number of nested archives found; the root archive read in {@code test} is not included. */
    private int tarfilecount = 0;

    /** Runs the computation on {@code v:/00.tar.gz} and prints the totals and the elapsed milliseconds. */
    @Test
    public void test() {
        long start = System.currentTimeMillis();
        try {
            File f = new File("v:/00.tar.gz");
            extractTarGzFile(new ByteArrayInputStream(
                    FileUtils.readFileToByteArray(f)));
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.out.println("sum: " + sum);
        System.out.println("tar.gz file count:" + tarfilecount);
        long end = System.currentTimeMillis();
        System.out.println("time:" + (end - start));
    }

    /**
     * Walks one in-memory tar.gz archive: nested archives are counted and processed
     * recursively, every other non-directory entry is parsed as an integer and added to
     * {@code sum}.
     *
     * @param is the gzip-compressed tar data; closed before this method returns
     * @throws Exception if the data is not a readable tar.gz or a plain entry is not an integer
     */
    private void extractTarGzFile(ByteArrayInputStream is) throws Exception {
        GZIPInputStream gis = new GZIPInputStream(new BufferedInputStream(is));
        ArchiveInputStream ais = null;
        try {
            ais = new ArchiveStreamFactory().createArchiveInputStream("tar", gis);
            TarArchiveEntry entry;
            // Process the entries of the archive one by one.
            while ((entry = (TarArchiveEntry) ais.getNextEntry()) != null) {
                // Only non-directory entries carry data to process.
                if (entry.isDirectory()) {
                    continue;
                }
                byte[] content = readEntry(ais);
                if (entry.getName().endsWith("tar.gz")) {
                    // Nested archive.
                    tarfilecount++;
                    extractTarGzFile(new ByteArrayInputStream(content));
                } else {
                    // File holding an int value; tolerate surrounding whitespace/newline.
                    sum = sum + Integer.parseInt(new String(content, "UTF-8").trim());
                }
            }
        } finally {
            // Archive finished (or failed): release the streams in either case.
            try {
                if (ais != null) {
                    ais.close();
                }
            } finally {
                gis.close();
            }
        }
    }

    /** Reads the remaining bytes of the current archive entry into memory. */
    private byte[] readEntry(ArchiveInputStream ais) throws java.io.IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        byte[] buffer = new byte[BUFFER_SIZE];
        int bytesRead;
        while ((bytesRead = ais.read(buffer)) != -1) {
            baos.write(buffer, 0, bytesRead);
        }
        return baos.toByteArray();
    }
}
这时的运行结果,速度杠杠的:
sum: 38454840
tar.gz file count:15442
time:1561
还有什么方法可以再将速度进一步提高吗?这里已经全部是内存中操作了,但这里递归只是单线程运行,进一步提高速度,只能是用多线程。
解法三,多线程解压
package cn.com.learners;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.GZIPInputStream;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.io.FileUtils;
import org.junit.Test;
/**
 * Third solution to the nested tar.gz puzzle: archives are decompressed in memory and every
 * nested archive is handed to a thread pool as its own task.
 *
 * @author Jason Li 2014-5
 */
public class TarGz_ThreadPoolAtomic {
    /** Size of the buffer used when reading an entry from the archive stream. */
    private static final int BUFFER_SIZE = 4096;
    /** Running total of the integers read from the plain files. */
    private final AtomicInteger sum = new AtomicInteger(0);
    /** Number of nested archives found; the root archive read in {@code test} is not included. */
    private final AtomicInteger count = new AtomicInteger(0);
    /** Number of archive tasks that have been created but have not finished yet. */
    private final AtomicInteger ThreadCount = new AtomicInteger(0);
    /** Released by the task that brings {@code ThreadCount} back to zero. */
    private volatile CountDownLatch finished = new CountDownLatch(1);

    /** Runs the computation on {@code v:/00.tar.gz} and prints the totals and the elapsed milliseconds. */
    @Test
    public void test() {
        long start = System.currentTimeMillis();
        ExecutorService pool = Executors.newFixedThreadPool(Runtime
                .getRuntime().availableProcessors() * 2);
        try {
            File f = new File("v:/00.tar.gz");
            ByteArrayInputStream bais = new ByteArrayInputStream(
                    FileUtils.readFileToByteArray(f));
            finished = new CountDownLatch(1);
            pool.execute(new CountThread(bais, pool));
            // Block until the last task has finished instead of spinning on the counter.
            finished.await();
            System.out.println("sum: " + sum.get());
            System.out.println("tar.gz files count:" + count.get());
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Without this the pool's worker threads would stay alive after the run.
            pool.shutdownNow();
        }
        long end = System.currentTimeMillis();
        System.out.println("time:" + (end - start));
    }

    /**
     * Pool task that processes one in-memory tar.gz archive and submits a new task for every
     * nested archive it finds. It is only ever run by the executor, so it is a plain
     * {@code Runnable} rather than a {@code Thread} that is never started.
     */
    class CountThread implements Runnable {
        private final ByteArrayInputStream is;
        private final ExecutorService pool;

        /**
         * @param is   gzip-compressed tar data to process
         * @param pool executor that receives the tasks for nested archives
         */
        public CountThread(ByteArrayInputStream is, ExecutorService pool) {
            this.pool = pool;
            this.is = is;
            // Registered at construction, i.e. while the submitting task is still counted,
            // so the counter cannot drop to zero while work is still outstanding.
            ThreadCount.incrementAndGet();
        }

        @Override
        public void run() {
            try {
                extractTarGzFile(is);
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                // Must also run on failure, otherwise the waiting thread would never wake up.
                if (ThreadCount.decrementAndGet() == 0) {
                    finished.countDown();
                }
            }
        }

        /**
         * Walks one archive: nested archives are counted and submitted to the pool, every
         * other non-directory entry is parsed as an integer and added to {@code sum}.
         *
         * @param is the gzip-compressed tar data; closed before this method returns
         * @throws Exception if the data is not a readable tar.gz or a plain entry is not an integer
         */
        private void extractTarGzFile(ByteArrayInputStream is) throws Exception {
            GZIPInputStream gis = new GZIPInputStream(new BufferedInputStream(is));
            ArchiveInputStream ais = null;
            try {
                ais = new ArchiveStreamFactory().createArchiveInputStream("tar",
                        gis);
                TarArchiveEntry entry;
                while ((entry = (TarArchiveEntry) ais.getNextEntry()) != null) {
                    // Only non-directory entries carry data to process.
                    if (entry.isDirectory()) {
                        continue;
                    }
                    byte[] content = readEntry(ais);
                    if (entry.getName().endsWith("tar.gz")) {
                        // Nested archive: count it and let the pool process it.
                        count.incrementAndGet();
                        pool.execute(new CountThread(
                                new ByteArrayInputStream(content), this.pool));
                    } else {
                        // File holding an int value; tolerate surrounding whitespace/newline.
                        sum.addAndGet(Integer.parseInt(
                                new String(content, "UTF-8").trim()));
                    }
                }
            } finally {
                try {
                    if (ais != null) {
                        ais.close();
                    }
                } finally {
                    gis.close();
                }
            }
        }

        /** Reads the remaining bytes of the current archive entry into memory. */
        private byte[] readEntry(ArchiveInputStream ais) throws IOException {
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            byte[] buffer = new byte[BUFFER_SIZE];
            int bytesRead;
            while ((bytesRead = ais.read(buffer)) != -1) {
                baos.write(buffer, 0, bytesRead);
            }
            return baos.toByteArray();
        }
    }
}
这时的运行速度已经非常满意了,这可是好几万个文件:
sum: 38454840
tar.gz files count:15442
time:929