//函数功能:将本地的大量文件cat到一起并上传至HDFS上
1.mkdir ./src
2.代码 ./src/PutMerge.java
package com.sdn.hadoop;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
/**
 * Merges (cats) every regular file in a local directory into a single
 * file on HDFS.
 *
 * Usage: PutMerge &lt;localInputDir&gt; &lt;hdfsOutputFile&gt;
 *   args[0] - local directory whose files are concatenated
 *   args[1] - destination file path on HDFS
 */
public class PutMerge{
public static void main(String[] args) throws IOException{
Configuration conf = new Configuration();
// FileSystem handle for HDFS (the destination)
FileSystem hdfs = FileSystem.get(conf);
// FileSystem handle for the local file system (the source)
FileSystem local = FileSystem.getLocal(conf);
Path inputDir = new Path(args[0]);
Path hdfsFile = new Path(args[1]);
try{
// get file list of inputDir
FileStatus[] inputFiles = local.listStatus(inputDir);
FSDataOutputStream out = hdfs.create(hdfsFile);
try{
// 4 KB buffer: 256 bytes caused one read/write call per 256 bytes copied
byte buffer[] = new byte[4096];
for(int i=0;i<inputFiles.length;i++){
// Skip sub-directories: local.open() on a directory would throw
if(inputFiles[i].isDir()){
continue;
}
System.out.println(inputFiles[i].getPath().getName());
FSDataInputStream in = local.open(inputFiles[i].getPath());
try{
int bytesRead = 0;
while( (bytesRead = in.read(buffer)) > 0){
out.write(buffer, 0, bytesRead);
}
}finally{
// close the input even if a read/write failed, so file handles don't leak
in.close();
}
}
}finally{
// always close the HDFS output stream so the file is finalized
out.close();
}
}catch (IOException e){
e.printStackTrace();
}
}
}
3.在./data 目录下准备输入数据,并放入待cat到一起的文件
4.编译java
javac -classpath /usr/local/hadoop-0.20.203.0/hadoop-core-0.20.203.0.jar:/usr/local/hadoop-0.20.203.0/lib/commons-cli-1.2.jar -d classes src/PutMerge.java
-classpath 是指定源代码中使用的各种类的库文件路径,中间以:分隔,-d 指定生成的class文件的放置路径
由于指定了package,因此其class文件是位于./classes/com/sdn/hadoop目录下的
5.打包
jar -cvf PutMerge.jar -C classes/ .
-C 表示在执行jar的时候要切换至classes目录,该目录下有编译好的class文件
6.执行
hadoop jar PutMerge.jar com.sdn.hadoop.PutMerge ./data /dw_ext/recmd/putmerge.data
输入路径:./data
输出文件:/dw_ext/recmd/putmerge.data
7.查看结果
hadoop fs -cat /dw_ext/recmd/putmerge.data
该文件就是将data目录下的所有文件cat到一起后生成的新文件
1.mkdir ./src
2.代码 ./src/PutMerge.java
package com.sdn.hadoop;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
/**
 * Concatenates all files found in a local directory and writes the
 * combined bytes to one file on HDFS.
 *
 * args[0] - local input directory, args[1] - HDFS output file
 */
public class PutMerge{
public static void main(String[] args) throws IOException{
Configuration conf = new Configuration();
// destination file system (HDFS)
FileSystem hdfs = FileSystem.get(conf);
// source file system (local disk)
FileSystem local = FileSystem.getLocal(conf);
Path inputDir = new Path(args[0]);
Path hdfsFile = new Path(args[1]);
try{
// enumerate the entries of the input directory
FileStatus[] entries = local.listStatus(inputDir);
FSDataOutputStream sink = hdfs.create(hdfsFile);
for(FileStatus entry : entries){
System.out.println(entry.getPath().getName());
FSDataInputStream source = local.open(entry.getPath());
byte[] chunk = new byte[256];
int n = 0;
// copy until end-of-file (read returns -1)
while( (n = source.read(chunk)) > 0){
sink.write(chunk, 0, n);
}
source.close();
}
sink.close();
}catch (IOException e){
e.printStackTrace();
}
}
}
3.在./data 目录下准备输入数据,并放入待cat到一起的文件
4.编译java
javac -classpath /usr/local/hadoop-0.20.203.0/hadoop-core-0.20.203.0.jar:/usr/local/hadoop-0.20.203.0/lib/commons-cli-1.2.jar -d classes src/PutMerge.java
-classpath 是指定源代码中使用的各种类的库文件路径,中间以:分隔,-d 指定生成的class文件的放置路径
由于指定了package,因此其class文件是位于./classes/com/sdn/hadoop目录下的
5.打包
jar -cvf PutMerge.jar -C classes/ .
-C 表示在执行jar的时候要切换至classes目录,该目录下有编译好的class文件
6.执行
hadoop jar PutMerge.jar com.sdn.hadoop.PutMerge ./data /dw_ext/recmd/putmerge.data
输入路径:./data
输出文件:/dw_ext/recmd/putmerge.data
7.查看结果
hadoop fs -cat /dw_ext/recmd/putmerge.data
该文件就是将data目录下的所有文件cat到一起后生成的新文件
PS:
源代码中未加入 package com.sdn.hadoop 时,直接执行 hadoop jar PutMerge.jar ./data /dw_ext/recmd/putmerge.data,报错如下:
Exception in thread "main" java.lang.ClassNotFoundException...
原因:打包时没有在 manifest 中指定 Main-Class,hadoop jar 会把 jar 文件名后的第一个参数当作主类名,于是 ./data 被当成类名而找不到。只需在命令中显式给出类名(无 package 声明时类名即为 PutMerge)即可正常执行:
hadoop jar PutMerge.jar PutMerge ./data /dw_ext/recmd/putmerge.data