Java统计HDFS中txt文件的词频

在分布式文件系统(HDFS)中存放着若干 txt 文件,通过 Java 程序递归遍历 HDFS 目录下的 txt 文件,统计出每个单词出现的频率。

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class HDFSApi {

	/** Matches file names ending in ".txt"; compiled once instead of on every file. */
	private static final Pattern TXT_PATTERN = Pattern.compile(".+\\.txt");

	/**
	 * Recursively walks {@code remoteDir} on HDFS, reads every *.txt file found
	 * and accumulates word frequencies into the supplied {@code map}.
	 *
	 * Counts are stored as decimal strings (e.g. "3") to preserve the original
	 * {@code Map<String, String>} contract used by {@link #main(String[])}.
	 *
	 * @param conf      Hadoop configuration (must carry fs.defaultFS)
	 * @param remoteDir HDFS directory to scan recursively
	 * @param map       accumulator: word -> count-as-string; updated in place
	 * @throws IOException on any HDFS access failure
	 */
	public static void lsDir(Configuration conf, String remoteDir, Map map)
			throws IOException {
		FileSystem fs = FileSystem.get(conf);
		Path dirPath = new Path(remoteDir);

		// true => recurse into subdirectories
		RemoteIterator<LocatedFileStatus> remoteIterator = fs.listFiles(
				dirPath, true);
		while (remoteIterator.hasNext()) {
			FileStatus s = remoteIterator.next();
			System.out.print("lujin:" + s.getPath().toString() + "\n");
			System.out.print("name:" + s.getPath().getName() + "\n");
			if (TXT_PATTERN.matcher(s.getPath().getName()).matches()) {
				// try-with-resources: the original leaked both the stream and
				// the reader on every file. Charset is fixed to UTF-8 so the
				// result does not depend on the JVM's platform default.
				try (FSDataInputStream in = fs.open(s.getPath());
						BufferedReader reader = new BufferedReader(
								new InputStreamReader(in, StandardCharsets.UTF_8))) {
					String line;
					while ((line = reader.readLine()) != null) {
						// \\s+ collapses runs of whitespace; split(" ") used to
						// emit empty tokens on consecutive spaces and count them.
						for (String word : line.split("\\s+")) {
							if (word.isEmpty()) {
								continue;
							}
							if (map.containsKey(word)) {
								int n = Integer.parseInt((String) map.get(word)) + 1;
								map.put(word, Integer.toString(n));
							} else {
								map.put(word, "1");
							}
						}
					}
				}
			}
		}

		// NOTE: this closes the (cached) FileSystem; main() re-acquires one
		// afterwards, matching the original program's behavior.
		fs.close();
	}

	/**
	 * Counts words under /user/hadoop/testFor1 and writes "word:count" lines
	 * to /user/hadoop/test/cz.txt on HDFS.
	 */
	public static void main(String[] args) {
		Configuration conf = new Configuration();
		conf.set("fs.defaultFS", "hdfs://localhost:9000");
		String remoteDir = "/user/hadoop/testFor1";
		String writeToFilePath = "/user/hadoop/test/cz.txt";

		Map<String, String> map = new HashMap<String, String>();

		try {
			HDFSApi.lsDir(conf, remoteDir, map);

			FileSystem fs = FileSystem.get(conf);
			Path writePath = new Path(writeToFilePath);
			// append() throws if the target does not yet exist; fall back to
			// create() in that case instead of crashing on first run.
			try (FSDataOutputStream out = fs.exists(writePath)
					? fs.append(writePath)
					: fs.create(writePath)) {
				// Generic for-each replaces the raw Iterator/cast dance.
				for (Map.Entry<String, String> entry : map.entrySet()) {
					System.out.println(entry.getKey() + ":" + entry.getValue());
					String record = entry.getKey() + ":" + entry.getValue() + "\r\n";
					out.write(record.getBytes(StandardCharsets.UTF_8));
				}
				out.flush();
			}
			fs.close();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
}

遍历的文件为testFor1里面的文件


运行后,查看写入的文件


第一次写博客,有什么问题都可以提出来

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值