基于Java语言的HDFS文件过滤与合并
一、创建所需文件
1. 运行Hadoop
sbin/start-dfs.sh
2. 在路径为[/user/hadoop]下创建file文件夹,用来保存我们的数据文件
./bin/hdfs dfs -mkdir /user/hadoop/file
3. 用下面命令分别在file文件夹创建四个文件
./bin/hdfs dfs -touchz /user/hadoop/file/file1.txt
./bin/hdfs dfs -touchz /user/hadoop/file/file2.txt
./bin/hdfs dfs -touchz /user/hadoop/file/file3.txt
./bin/hdfs dfs -touchz /user/hadoop/file/file4.abc
4. 向刚刚创建的文件分别写入数据
echo "Welcome" | ./bin/hdfs dfs -appendToFile - /user/hadoop/file/file1.txt
echo "to" | ./bin/hdfs dfs -appendToFile - /user/hadoop/file/file2.txt
echo "Hadoop" | ./bin/hdfs dfs -appendToFile - /user/hadoop/file/file3.txt
echo "too" | ./bin/hdfs dfs -appendToFile - /user/hadoop/file/file4.abc
用以下命令可以查看数据内容:
./bin/hdfs dfs -cat /user/hadoop/file/file1.txt
以上创建过程的截图如下:
5.创建文件Merge.txt,用于合并接收过滤后的文件
./bin/hdfs dfs -touchz /user/hadoop/input/Merge.txt
二、编写Java程序,完成HDFS文件过滤与合并
1.代码如下
package mergeFile;
import java.io.IOException;
import java.io.PrintStream;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
class myPathFilter implements PathFilter{ //过滤掉文件名满足特定条件的文件
String reg = null;
myPathFilter(String reg){
this.reg = reg;
}
public boolean accept(Path path) {
if(!(path.toString().matches(reg)))
return true;
return false;
}
}
/**
 * Merges all files in an HDFS directory into a single output file,
 * skipping files whose path matches the exclusion pattern (here: *.abc).
 */
public class merge {
    Path inputPath = null;   // directory containing the files to be merged
    Path outputPath = null;  // path of the merged output file

    /**
     * @param input  HDFS URI of the directory whose files will be merged
     * @param output HDFS URI of the merged output file
     */
    public merge(String input, String output) {
        this.inputPath = new Path(input);
        this.outputPath = new Path(output);
    }

    /**
     * Reads every non-excluded file under {@code inputPath}, echoes its
     * metadata and content to stdout, and appends the content to the single
     * file at {@code outputPath}.
     *
     * @throws IOException if any HDFS operation fails
     */
    public void doMerge() throws IOException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://localhost:9000");
        conf.set("fs.hdfs.impl", "org.apache.hadoop.hdfs.DistributedFileSystem");
        FileSystem fsSource = FileSystem.get(URI.create(inputPath.toString()), conf);
        FileSystem fsDst = FileSystem.get(URI.create(outputPath.toString()), conf);
        // List the source directory, excluding every *.abc file.
        FileStatus[] sourceStatus = fsSource.listStatus(inputPath, new myPathFilter(".*\\.abc"));
        // try-with-resources closes the output stream even when a read fails
        // (the original code leaked it on any exception inside the loop).
        try (FSDataOutputStream fsdos = fsDst.create(outputPath)) {
            // One console wrapper is enough — hoisted out of the loop.
            PrintStream ps = new PrintStream(System.out);
            for (FileStatus sta : sourceStatus) {
                System.out.println("路径: " + sta.getPath() + " 文件大小: " + sta.getLen() + " 权限: " + sta.getPermission() + " 内容: ");
                // Close each input stream after copying; the original never
                // closed these, leaking one handle per merged file.
                try (FSDataInputStream fsdis = fsSource.open(sta.getPath())) {
                    byte[] data = new byte[1024];
                    int read;
                    while ((read = fsdis.read(data)) > 0) {
                        ps.write(data, 0, read);    // echo content to console
                        fsdos.write(data, 0, read); // append content to merged file
                    }
                }
            }
        }
    }

    public static void main(String args[]) throws IOException {
        merge merge = new merge("hdfs://localhost:9000/user/hadoop/file", "hdfs://localhost:9000/user/hadoop/input/Merge.txt");
        merge.doMerge();
    }
}
2.运行结果:
3.在终端利用cat命令查看Merge.txt文件内容
./bin/hdfs dfs -cat /user/hadoop/input/Merge.txt
说明已经将文件file1.txt、file2.txt、file3.txt合并了,并过滤了file4.abc文件