小文件合并的逻辑
- 过滤掉.svn格式的文件
- 循环所有文件夹, 通过globStatus()获取每个文件夹下所有.txt格式文件的路径
- 通过IOUtils.copyBytes()将数据集合并为7个文件
- 上传至HDFS
代码
/**
 * Merges many small local text files into a few larger files on a target
 * Hadoop file system.
 *
 * <p>Every path matched by the source glob is treated as one input directory.
 * All {@code .txt} files directly inside such a directory are concatenated, in
 * the order returned by the glob, into a single destination file whose name is
 * derived from the directory name (see {@link #getFileName(String)}).
 */
public class CombineSmallFile {

    /** Source glob used by {@link #main(String[])} when no argument is supplied. */
    private static final String DEFAULT_SOURCE = "E://项目资料/hadoop/data/73/*";

    /** Destination directory used by {@link #main(String[])} when no argument is supplied. */
    private static final String DEFAULT_DEST = "hdfs://hy:9000/hy/tv/";

    /** Buffer size, in bytes, handed to {@code IOUtils.copyBytes}. */
    private static final int COPY_BUFFER_SIZE = 4096;

    /** Shared Hadoop configuration; created lazily by {@link #init()}. */
    private static Configuration conf;

    /**
     * A {@link PathFilter} that matches the full string form of a path against
     * a regular expression and either rejects or accepts the matching paths.
     *
     * @author andy
     */
    public static class RegexPathFilter implements PathFilter {

        /** Mode: paths matching the regex are rejected, all others accepted. */
        public static final int MOOD_FILTER = 0;

        /** Mode: only paths matching the regex are accepted. */
        public static final int MOOD_SELECT = 1;

        /** Regex compiled once so that {@link #accept(Path)} does not recompile it per path. */
        private final java.util.regex.Pattern pattern;

        private final int mood;

        /**
         * @param regex regular expression that must match the whole {@code path.toString()}
         * @param mood  {@link #MOOD_FILTER} to exclude matches; any other value
         *              (normally {@link #MOOD_SELECT}) keeps only matches
         */
        public RegexPathFilter(String regex, int mood) {
            this.pattern = java.util.regex.Pattern.compile(regex);
            this.mood = mood;
        }

        /**
         * @param path candidate path
         * @return {@code true} if the path passes the filter in the configured mode
         */
        @Override
        public boolean accept(Path path) {
            boolean matches = pattern.matcher(path.toString()).matches();
            return mood == MOOD_FILTER ? !matches : matches;
        }
    }

    /**
     * Merges and uploads small files.
     *
     * <p>Each path matched by {@code srcPath} on the local file system (paths
     * ending in {@code svn} are skipped) is merged into one file under
     * {@code dstPath}. An existing destination file is appended to; otherwise it
     * is created. The destination directory is created if it does not exist.
     *
     * @param srcPath local path or glob selecting the input directories
     * @param dstPath destination directory; its scheme/authority (if any) selects
     *                the target file system, otherwise the configured default is used
     * @throws IOException        if any file system operation fails
     * @throws URISyntaxException never thrown by this implementation; kept in the
     *                            signature for compatibility with existing callers
     */
    public static void combineFileUploadList(String srcPath, String dstPath) throws IOException, URISyntaxException {
        // Do not rely on the caller having invoked init() first.
        init();

        Path dstDir = new Path(dstPath);
        // Resolve the file system from the destination itself so that an explicit
        // "hdfs://host:port/..." destination works regardless of the default FS.
        FileSystem fs = dstDir.getFileSystem(conf);
        try {
            LocalFileSystem local = FileSystem.getLocal(conf);
            FileStatus[] dirStatus = local.globStatus(
                    new Path(srcPath),
                    new RegexPathFilter("^.*svn$", RegexPathFilter.MOOD_FILTER));
            Path[] dirs = FileUtil.stat2Paths(dirStatus);

            if (!fs.exists(dstDir)) {
                fs.mkdirs(dstDir);
            }
            if (dirs == null) {
                // NOTE(review): globStatus can yield null when nothing matches; nothing to merge then.
                return;
            }
            for (Path dir : dirs) {
                mergeDirectory(local, fs, dir, dstDir);
            }
        } finally {
            fs.close();
        }
    }

    /**
     * Concatenates every {@code .txt} file directly inside {@code dir} into the
     * destination file for that directory, closing all streams even on failure.
     */
    private static void mergeDirectory(LocalFileSystem local, FileSystem fs, Path dir, Path dstDir)
            throws IOException {
        // Glob relative to the matched directory itself rather than re-building
        // the path from the original source string.
        FileStatus[] txtStatus = local.globStatus(
                new Path(dir, "*"),
                new RegexPathFilter("^.*txt$", RegexPathFilter.MOOD_SELECT));
        Path[] txtFiles = FileUtil.stat2Paths(txtStatus);

        Path block = new Path(dstDir, getFileName(dir.getName()));
        FSDataOutputStream out = fs.exists(block) ? fs.append(block) : fs.create(block);
        try {
            if (txtFiles != null) {
                for (Path file : txtFiles) {
                    FSDataInputStream in = local.open(file);
                    try {
                        // 'false': copyBytes must not close the streams; 'out' is reused for the next file.
                        IOUtils.copyBytes(in, out, COPY_BUFFER_SIZE, false);
                    } finally {
                        IOUtils.closeStream(in);
                    }
                }
            }
            // Close explicitly on the success path so a failure while closing is reported.
            out.close();
            out = null;
        } finally {
            // Only still non-null if an exception interrupted the merge.
            IOUtils.closeStream(out);
        }
    }

    /**
     * Derives the merged file name from a directory name by removing every
     * {@code -} and appending {@code .txt}, e.g. {@code 2012-09-17 -> 20120917.txt}.
     *
     * @param dirName name of the input directory
     * @return name of the merged output file
     */
    private static String getFileName(String dirName) {
        return dirName.replace("-", "") + ".txt";
    }

    /**
     * Performs one-time initialisation: creates the shared {@link Configuration}
     * if it has not been created yet.
     */
    public static void init() {
        if (conf == null) {
            conf = new Configuration();
        }
    }

    /**
     * Command-line entry point.
     *
     * @param args optional: {@code args[0]} is the source glob and {@code args[1]}
     *             the destination directory; built-in defaults are used for any
     *             argument that is missing
     * @throws IOException        if the merge fails
     * @throws URISyntaxException declared by {@link #combineFileUploadList(String, String)}
     */
    public static void main(String[] args) throws IOException, URISyntaxException {
        String source = (args != null && args.length > 0) ? args[0] : DEFAULT_SOURCE;
        String dest = (args != null && args.length > 1) ? args[1] : DEFAULT_DEST;
        init();
        combineFileUploadList(source, dest);
    }
}