大数据学习(六):HDFS实现wordcount计数

准备数据

Apache Spark, Spark, Apache, the Apache feather logo,
and the Apache Spark project logo are either registered trademarks or trademarks of The Apache Software Foundation 
in the United States and other countries. See guidance on use of Apache Spark trademarks. 
All other marks mentioned may be trademarks or registered trademarks of their respective owners. 
Copyright © 2018 The Apache Software Foundation, Licensed under the Apache License, Version 2.0.

使用hadoop fs -put 将数据上传至/wordcount/input目录

代码

package com.bigdata.hdfs.wordcount;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class HDFSWordCount {
	
	public static void main(String[] args) throws Exception{
		/**
		 * 处理数据
		 */
		FileSystem fs = FileSystem.get(new URI("hdfs://spark1:9000"), new Configuration(), "root");
		//计数文件所在位置
		Path input = new Path("/wordcount/input/");
		//循环遍历计数文件,并进行计算:第一个参数是服务器路径,第二个参数是否递归
		RemoteIterator<LocatedFileStatus> iter = fs.listFiles(input, false);
		
		Context context = new Context();
		
		while(iter.hasNext()) {
			LocatedFileStatus file = iter.next();
			FSDataInputStream in = fs.open(file.getPath());
			// 逐行读取,文件内容
			BufferedReader br = new BufferedReader(new InputStreamReader(in));
			String line = null;
			while ((line = br.readLine()) != null) {
				// 调用一个方法对每一行进行业务处理
				WordCountUtil.map(line, context);
				
			}
			
			br.close();
			in.close();
		}
		
		
		//输出结果
		Map<Object,Object> contextMap = context.getContextMap();
		
		//HDFS输出路径
		Path output = new Path("/wordcount/output/");
		//路径不存在 创建文件夹
		if(!fs.isFile(output)) {
			fs.mkdirs(output);
		}
		
		FSDataOutputStream out = fs.create(new Path(output,new Path("res.dat")), true);
		Set<Entry<Object, Object>> entrySet = contextMap.entrySet();
		//写入
		for (Entry<Object, Object> entry : entrySet) {
			out.write((entry.getKey().toString()+"\t"+entry.getValue()+"\n").getBytes());
		}
		
		out.close();
		fs.close();
		System.out.println("恭喜!数据统计完成......");
	}

}

工具类

package com.bigdata.hdfs.wordcount;
/**
 * 计数工具类
 * @author 90669
 *
 */
public class WordCountUtil {

	/**
	 * 计数
	 * @param line
	 * @param context
	 */
	public static void map(String line,Context context) {
		String [] words = line.split(" ");
		for (String word : words) {
			Object value = context.get(word);
			if(null==value) {
				context.write(word, 1);
			}else {
				int count = (int)value;
				context.write(word, count+1);
			}
		}
	}
}

package com.bigdata.hdfs.wordcount;

import java.util.HashMap;
import java.util.Map;

public class Context {
	
	private Map<Object,Object> contextMap = new HashMap<>();
	
	public void write(Object key,Object value) {
		contextMap.put(key, value);
	}
	
	public Object get(Object key) {
		return contextMap.get(key);
	}
	
	public Map<Object,Object> getContextMap(){
		return contextMap;
	}

}

计算结果

guidance	1
Software	2
other	2
marks	1
mentioned	1
use	1
All	1
Apache,	1
feather	1
trademarks.	1
2.0.	1
Apache	7
of	3
The	2
are	1
License,	1
respective	1
Licensed	1
trademarks	4
Foundation	1
Copyright	1
on	1
logo,	1
Version	1
countries.	1
be	1
2018	1
owners.	1
logo	1
©	1
may	1
Spark	2
or	2
under	1
the	4
Spark,	2
See	1
in	1
States	1
and	2
project	1
either	1
registered	2
Foundation,	1
their	1
United	1

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值