Text files are stored in the distributed file system (HDFS). The Java program below recursively traverses the txt files in HDFS and counts how often each word appears.
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
public class HDFSApi {
    // Count the words in every .txt file under remoteDir, accumulating the
    // results into the map passed in by the caller as word -> count pairs
    public static void lsDir(Configuration conf, String remoteDir, Map<String, Integer> map)
            throws IOException {
        FileSystem fs = FileSystem.get(conf);
        Path dirPath = new Path(remoteDir);
        String pattern = ".+\\.txt"; // match only .txt files
        String content = null;
        // list the directory recursively
        RemoteIterator<LocatedFileStatus> remoteIterator = fs.listFiles(dirPath, true);
        while (remoteIterator.hasNext()) {
            LocatedFileStatus s = remoteIterator.next();
            System.out.println("path: " + s.getPath().toString());
            System.out.println("name: " + s.getPath().getName());
            if (Pattern.matches(pattern, s.getPath().getName())) {
                FSDataInputStream getIt = fs.open(s.getPath());
                BufferedReader d = new BufferedReader(new InputStreamReader(getIt));
                while ((content = d.readLine()) != null) {
                    // split on whitespace; consecutive spaces would otherwise
                    // produce empty tokens, which we skip below
                    String[] arr = content.split("\\s+");
                    for (String word : arr) {
                        if (word.isEmpty()) {
                            continue;
                        }
                        // increment the count, starting from 0 for unseen words
                        map.put(word, map.getOrDefault(word, 0) + 1);
                    }
                }
                d.close();
            }
        }
        // FileSystem.get returns a cached instance that main reuses and closes,
        // so do not close it here
    }
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://localhost:9000");
        String remoteDir = "/user/hadoop/testFor1";
        String writeToFilePath = "/user/hadoop/test/cz.txt";
        Map<String, Integer> map = new HashMap<String, Integer>();
        try {
            HDFSApi.lsDir(conf, remoteDir, map);
            FileSystem fs = FileSystem.get(conf);
            Path writePath = new Path(writeToFilePath);
            // create() creates the output file (overwriting it if it exists);
            // append() would fail when the file does not exist yet
            FSDataOutputStream out = fs.create(writePath);
            // write each word:count pair from the map into a text file on HDFS
            for (Map.Entry<String, Integer> entry : map.entrySet()) {
                System.out.println(entry.getKey() + ":" + entry.getValue());
                String str = entry.getKey() + ":" + entry.getValue() + "\r\n";
                out.write(str.getBytes());
            }
            out.flush();
            out.close();
            fs.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

The files traversed are those under the testFor1 directory.
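As an aside, on Java 8+ the getOrDefault/put pair in lsDir can be collapsed into a single Map.merge call. A minimal self-contained sketch (the MergeDemo class and the sample input are just for illustration):

import java.util.HashMap;
import java.util.Map;

public class MergeDemo {
    public static void main(String[] args) {
        Map<String, Integer> map = new HashMap<String, Integer>();
        for (String word : "a b a".split("\\s+")) {
            // insert 1 for a new word, otherwise add 1 to the existing count
            map.merge(word, 1, Integer::sum);
        }
        System.out.println(map); // prints {a=2, b=1} (iteration order may vary)
    }
}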
After running, check the file that was written.
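You can inspect the output with hdfs dfs -cat /user/hadoop/test/cz.txt, or read it back through the same FileSystem API. A minimal sketch, assuming the same fs.defaultFS and output path as above (the ReadBack class name is just for illustration):

import java.io.BufferedReader;
import java.io.InputStreamReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ReadBack {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://localhost:9000");
        FileSystem fs = FileSystem.get(conf);
        // open the file the word counter wrote and print it line by line
        FSDataInputStream in = fs.open(new Path("/user/hadoop/test/cz.txt"));
        BufferedReader reader = new BufferedReader(new InputStreamReader(in));
        String line;
        while ((line = reader.readLine()) != null) {
            System.out.println(line);
        }
        reader.close();
        fs.close();
    }
}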
This is my first blog post; feel free to point out any problems.