解压缩tar.gz文件

题目如下:

定义文件xx.tar.gz的产生方式如下:
•以xx为文件名的文件通过tar 和gzip打包压缩产生,该文件中以字符串的方式记录了一个非负整数;
•或者以xx为名的目录通过tar和gzip打包压缩产生,该目录中包含若干xx.tar.gz。
其中,x ∈ [0, 9],即 xx 为两位数字(00 ~ 99)。

现给定一个根据上述定义生成的文件00.tar.gz,请确定其中包含的以xx为文件名的文件个数以及这些文件中所记录的非负整数之和。

解压tar.gz文件需要用到的包:

<dependency>
	<groupId>org.apache.commons</groupId>
	<artifactId>commons-compress</artifactId>
	<version>1.8</version>
</dependency>

 

最初的解法:

package cn.com.learners;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.GZIPInputStream;

import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.junit.Test;

/**
 * Solves the nested tar.gz puzzle by physically extracting every archive to
 * disk.
 * <ol>
 * <li>Extract all entries of an archive next to it and collect the paths of
 * the extracted files.</li>
 * <li>Walk that list: a plain file contributes the number stored in it to the
 * sum; a nested archive increments the archive counter and is processed
 * recursively starting from step 1.</li>
 * </ol>
 *
 * @author Jason Li 2014-5
 */
public class TarGz {

	/** Separator used when composing paths; tar entry names always use "/". */
	private final static String FileSeparator = "/";

	/** Size of the buffer used to copy entry contents to disk. */
	private final static int BUFFER_SIZE = 4096;

	/** Running total of all numbers read; long so that many ints cannot overflow it. */
	private long sum = 0;

	/** Number of nested tar.gz archives encountered so far. */
	private int tarfilecount = 0;

	/**
	 * Runs the computation on the fixed input archive and prints the sum, the
	 * archive count and the elapsed time in milliseconds.
	 */
	@Test
	public void test() {

		long start = System.currentTimeMillis();

		try {
			count("v:", "00.tar.gz");
		} catch (Exception e) {
			e.printStackTrace();
		}

		System.out.println("sum: " + sum);
		System.out.println("tar.gz file count:" + tarfilecount);

		long end = System.currentTimeMillis();
		System.out.println("time:" + (end - start));
	}

	/**
	 * Extracts the given archive and recursively processes everything in it,
	 * updating the running sum and the nested-archive counter.
	 *
	 * @param filePath directory that contains the archive; extracted entries
	 *                 are written below it
	 * @param fileName name of the archive inside {@code filePath}
	 * @throws Exception if the archive cannot be read or extracted, or a plain
	 *                   file does not contain a parsable number
	 */
	public void count(String filePath, String fileName) throws Exception {

		List<String> filelist = extractTarGzFile(filePath, fileName);

		for (String s : filelist) {
			if (s.endsWith("tar.gz")) {
				tarfilecount++;
				// Split the extracted path back into directory and file name.
				int cut = s.lastIndexOf(FileSeparator);
				count(s.substring(0, cut), s.substring(cut + 1));
			} else {
				sum += readNumberFromFile(s);
			}
		}
	}

	/**
	 * Reads the number stored on the first line of a file.
	 *
	 * @param s path of the file to read
	 * @return the parsed value
	 * @throws Exception if the file cannot be read, is empty, or its first
	 *                   line is not a number
	 */
	private long readNumberFromFile(String s) throws Exception {

		BufferedReader br = new BufferedReader(new FileReader(s));
		try {
			String line = br.readLine();
			if (line == null) {
				throw new NumberFormatException("no number found in " + s);
			}
			// Tolerate surrounding whitespace around the digits.
			return Long.parseLong(line.trim());
		} finally {
			// Closing the BufferedReader also closes the underlying FileReader.
			br.close();
		}
	}

	/**
	 * Extracts every entry of a tar.gz archive below {@code filePath}.
	 *
	 * @param filePath directory that contains the archive and receives its
	 *                 contents
	 * @param fileName name of the archive inside {@code filePath}
	 * @return paths of all extracted non-directory entries, in archive order
	 * @throws Exception if the archive cannot be read, an entry cannot be
	 *                   written, or an entry name tries to leave
	 *                   {@code filePath}
	 */
	private List<String> extractTarGzFile(String filePath, String fileName)
			throws Exception {

		List<String> filelist = new ArrayList<String>();

		InputStream is = new FileInputStream(filePath + FileSeparator
				+ fileName);
		try {
			GZIPInputStream gis = new GZIPInputStream(
					new BufferedInputStream(is));
			try {
				ArchiveInputStream ais = new ArchiveStreamFactory()
						.createArchiveInputStream("tar", gis);

				byte[] buffer = new byte[BUFFER_SIZE];
				TarArchiveEntry entry;
				while ((entry = (TarArchiveEntry) ais.getNextEntry()) != null) {

					String name = resolveEntryPath(filePath, entry.getName());

					if (entry.isDirectory()) {
						mkFolder(name);
						continue;
					}

					filelist.add(name);
					copyEntry(ais, mkFile(name), buffer);
				}
			} finally {
				// Releases the inflater even when extraction fails half-way.
				gis.close();
			}
		} finally {
			// Covers the case where the GZIP header itself could not be read.
			is.close();
		}

		return filelist;
	}

	/**
	 * Maps a tar entry name to a path below {@code filePath}.
	 *
	 * @param filePath  extraction root
	 * @param entryName entry name as stored in the archive
	 * @return {@code filePath} followed by each component of the entry name
	 * @throws IOException if the entry name contains a ".." component
	 */
	private String resolveEntryPath(String filePath, String entryName)
			throws IOException {

		StringBuilder path = new StringBuilder(filePath);
		for (String part : entryName.split("/")) {
			if ("..".equals(part)) {
				// Refuse entries that would be written outside the root.
				throw new IOException("illegal entry name: " + entryName);
			}
			path.append(FileSeparator).append(part);
		}
		return path.toString();
	}

	/**
	 * Copies the content of the current archive entry into a file.
	 *
	 * @param ais    archive stream positioned on the entry to copy
	 * @param target file that receives the content (overwritten if present)
	 * @param buffer scratch buffer reused between entries
	 * @throws IOException if reading or writing fails
	 */
	private void copyEntry(ArchiveInputStream ais, File target, byte[] buffer)
			throws IOException {

		BufferedOutputStream out = new BufferedOutputStream(
				new FileOutputStream(target));
		try {
			int n;
			// read() reports -1 at the end of the current entry.
			while ((n = ais.read(buffer)) != -1) {
				out.write(buffer, 0, n);
			}
		} finally {
			// close() flushes whatever is still buffered.
			out.close();
		}
	}

	/**
	 * Creates a directory including any missing parents.
	 *
	 * @param fileName path of the directory
	 * @throws IOException if the directory does not exist and cannot be
	 *                     created
	 */
	private void mkFolder(String fileName) throws IOException {

		File f = new File(fileName);
		if (!f.isDirectory() && !f.mkdirs()) {
			throw new IOException("cannot create directory " + fileName);
		}
	}

	/**
	 * Prepares the target file for an extracted entry, creating its parent
	 * directory when the archive carried no explicit directory entry for it.
	 *
	 * @param fileName path of the file
	 * @return the (possibly newly created) file
	 * @throws IOException if the parent directory or the file cannot be
	 *                     created
	 */
	private File mkFile(String fileName) throws IOException {

		File f = new File(fileName);
		File parent = f.getParentFile();
		if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
			throw new IOException("cannot create directory " + parent);
		}
		f.createNewFile();
		return f;
	}

}

 运行结果:

sum: 38454840
tar.gz file count:15442
time:36709

 

程序运行很慢,大部分时间都浪费在了IO上面,因为小文件非常多,这是运行完成后的文件及目录统计:

 

 

解法二,不再真的把文件解压到磁盘,而是直接把压缩文件中的内容按流进行操作

package cn.com.learners;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.util.zip.GZIPInputStream;

import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.io.FileUtils;
import org.junit.Test;
/**
 * Solves the nested tar.gz puzzle entirely in memory: nothing is extracted to
 * disk, every archive is processed as a byte stream and nested archives are
 * handled by recursion.
 *
 * @author Jason Li 2014-5
 */
public class TarGz_Recursion {

	/**
	 * Running total of all numbers read. Kept per instance (not static) so
	 * repeated runs in one JVM do not accumulate, and long so that the total
	 * of many ints cannot overflow.
	 */
	private long sum = 0;

	/** Number of nested tar.gz archives encountered so far. */
	private int tarfilecount = 0;

	/**
	 * Loads the fixed input archive into memory, processes it and prints the
	 * sum, the archive count and the elapsed time in milliseconds.
	 */
	@Test
	public void test() {

		long start = System.currentTimeMillis();

		try {
			File f = new File("v:/00.tar.gz");
			ByteArrayInputStream bais = new ByteArrayInputStream(
					FileUtils.readFileToByteArray(f));
			extractTarGzFile(bais);
		} catch (Exception e) {
			e.printStackTrace();
		}

		System.out.println("sum: " + sum);
		System.out.println("tar.gz file count:" + tarfilecount);

		long end = System.currentTimeMillis();
		System.out.println("time:" + (end - start));
	}

	/**
	 * Walks one tar.gz archive held in memory. Directory entries are skipped,
	 * nested archives are counted and processed recursively, and every other
	 * entry is parsed as a number and added to the sum.
	 *
	 * @param is the compressed archive bytes
	 * @throws Exception if the archive is malformed or a plain entry does not
	 *                   contain a parsable number
	 */
	private void extractTarGzFile(ByteArrayInputStream is) throws Exception {

		GZIPInputStream gis = new GZIPInputStream(new BufferedInputStream(is));
		try {
			ArchiveInputStream ais = new ArchiveStreamFactory()
					.createArchiveInputStream("tar", gis);

			byte[] buffer = new byte[4096];
			TarArchiveEntry entry;
			// Iterate over the entries of this archive.
			while ((entry = (TarArchiveEntry) ais.getNextEntry()) != null) {
				// Only non-directory entries carry data worth processing.
				if (entry.isDirectory()) {
					continue;
				}

				// Buffer the entry content; read() reports -1 at entry end.
				ByteArrayOutputStream baos = new ByteArrayOutputStream();
				int bytesRead;
				while ((bytesRead = ais.read(buffer)) != -1) {
					baos.write(buffer, 0, bytesRead);
				}

				if (entry.getName().endsWith("tar.gz")) {
					// Nested archive: count it and descend.
					tarfilecount++;
					extractTarGzFile(new ByteArrayInputStream(
							baos.toByteArray()));
				} else {
					// Plain file holding a number; decode with a fixed charset
					// and ignore surrounding whitespace such as a newline.
					sum += Long.parseLong(baos.toString("UTF-8").trim());
				}
			}
		} finally {
			// Releases the inflater even when parsing fails half-way; this
			// also closes the wrapped in-memory stream.
			gis.close();
		}
	}
}

 

这时的运行结果,速度杠杠的:

sum: 38454840
tar.gz file count:15442
time:1561

 

还有什么方法可以再将速度进一步提高吗?这里已经全部是内存中操作了,但这里递归只是单线程运行,进一步提高速度,只能是用多线程。

解法三,多线程解压

package cn.com.learners;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.GZIPInputStream;

import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.io.FileUtils;
import org.junit.Test;
/**
 * Solves the nested tar.gz puzzle in memory using a fixed thread pool: every
 * nested archive becomes its own task.
 *
 * @author Jason Li 2014-5
 */
public class TarGz_ThreadPoolAtomic {

	/** Total of all numbers read; a long so that many ints cannot overflow it. */
	private final java.util.concurrent.atomic.AtomicLong sum =
			new java.util.concurrent.atomic.AtomicLong(0);

	/** Number of nested tar.gz archives encountered so far. */
	private final AtomicInteger count = new AtomicInteger(0);

	/** Number of tasks that have been created but have not finished yet. */
	private final AtomicInteger pendingTasks = new AtomicInteger(0);

	/** Released once {@link #pendingTasks} drops back to zero. */
	private final java.util.concurrent.CountDownLatch allDone =
			new java.util.concurrent.CountDownLatch(1);

	/**
	 * Loads the fixed input archive into memory, processes it with the pool
	 * and prints the sum, the archive count and the elapsed time in
	 * milliseconds.
	 */
	@Test
	public void test() {

		long start = System.currentTimeMillis();

		ExecutorService pool = Executors.newFixedThreadPool(Runtime
				.getRuntime().availableProcessors() * 2);
		try {
			File f = new File("v:/00.tar.gz");
			ByteArrayInputStream bais = new ByteArrayInputStream(
					FileUtils.readFileToByteArray(f));
			pool.execute(new CountThread(bais, pool));

			// Block until the last task signals completion instead of
			// spinning on the counter.
			allDone.await();

			System.out.println("sum: " + sum.get());
			System.out.println("tar.gz files count:" + count.get());

		} catch (InterruptedException e) {
			// Preserve the interrupt status for whoever runs this test.
			Thread.currentThread().interrupt();
			e.printStackTrace();
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			// Let the worker threads terminate.
			pool.shutdown();
		}

		long end = System.currentTimeMillis();
		System.out.println("time:" + (end - start));
	}

	/**
	 * Task that processes one in-memory tar.gz archive and submits a new task
	 * for each nested archive it finds. It is only ever handed to the
	 * executor, so it is a plain {@link Runnable} rather than a thread.
	 */
	class CountThread implements Runnable {

		private final ByteArrayInputStream is;
		private final ExecutorService pool;

		/**
		 * Creates the task and registers it as pending. Registration happens
		 * here, before the creating task can finish, so the pending counter
		 * cannot reach zero while work is still outstanding.
		 *
		 * @param is   the compressed archive bytes
		 * @param pool executor used to schedule tasks for nested archives
		 */
		public CountThread(ByteArrayInputStream is, ExecutorService pool) {
			this.pool = pool;
			this.is = is;
			pendingTasks.incrementAndGet();
		}

		/**
		 * Processes the archive. The task is always unregistered afterwards,
		 * even on failure, and the waiting thread is woken when this was the
		 * last pending task.
		 */
		@Override
		public void run() {
			try {
				extractTarGzFile(is);
			} catch (Exception e) {
				e.printStackTrace();
			} finally {
				if (pendingTasks.decrementAndGet() == 0) {
					allDone.countDown();
				}
			}
		}

		/**
		 * Walks one tar.gz archive held in memory. Directory entries are
		 * skipped, nested archives are counted and scheduled as new tasks,
		 * and every other entry is parsed as a number and added to the sum.
		 *
		 * @param is the compressed archive bytes
		 * @throws Exception if the archive is malformed or a plain entry does
		 *                   not contain a parsable number
		 */
		private void extractTarGzFile(ByteArrayInputStream is) throws Exception {

			GZIPInputStream gis = new GZIPInputStream(
					new BufferedInputStream(is));
			try {
				ArchiveInputStream ais = new ArchiveStreamFactory()
						.createArchiveInputStream("tar", gis);

				byte[] buffer = new byte[4096];
				TarArchiveEntry entry;
				while ((entry = (TarArchiveEntry) ais.getNextEntry()) != null) {

					// Only non-directory entries carry data worth processing.
					if (entry.isDirectory()) {
						continue;
					}

					// Buffer the entry content; read() reports -1 at entry end.
					ByteArrayOutputStream baos = new ByteArrayOutputStream();
					int bytesRead;
					while ((bytesRead = ais.read(buffer)) != -1) {
						baos.write(buffer, 0, bytesRead);
					}

					if (entry.getName().endsWith("tar.gz")) {
						// Nested archive: count it and hand it to the pool.
						count.incrementAndGet();
						pool.execute(new CountThread(new ByteArrayInputStream(
								baos.toByteArray()), this.pool));
					} else {
						// Plain file holding a number; decode with a fixed
						// charset and ignore surrounding whitespace.
						sum.addAndGet(Long.parseLong(baos.toString("UTF-8")
								.trim()));
					}
				}
			} finally {
				// Releases the inflater even when parsing fails half-way;
				// this also closes the wrapped in-memory stream.
				gis.close();
			}
		}

	}

}

这时的运行速度已经非常满意了,这可是好几万个文件:

sum: 38454840
tar.gz files count:15442
time:929

 

 


 

 

 

转载于:https://my.oschina.net/jasonli0102/blog/279931

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值