1.利用dask分布式,首先搭建一个dask分布式系统;
2.搭建一个HDFS分布式文件系统;对于分布式系统来说,由于计算能力分散在多台机器上,最好搭建一个局域网共享文件系统,避免本地数据的频繁分发传递;
3.代码如下所示:
# Word counting over HDFS files on a Dask distributed cluster.
from hdfs import Client
from distributed import Client as Cl,progress
from collections import defaultdict,Counter

hdfs = Client('http://192.168.175.139:9870')  # connect to HDFS (WebHDFS endpoint)
print(hdfs.list('/'))  # show the files under the HDFS root directory
client = Cl('172.26.244.71:8786')  # connect to the Dask scheduler
# ncores is a method — the original printed the bound method object
# instead of the per-worker core counts; call it.
print(client.ncores())
filenames = hdfs.list('/test/input1')
print(filenames)  # list of the input file names
def count_words(fn):
    """Count word occurrences in one HDFS input file.

    Runs on a Dask worker, so it opens its own HDFS connection rather
    than reusing the driver's (connection objects don't serialize well
    when tasks are shipped to workers).

    :param fn: file name relative to /test/input1/ on HDFS.
    :return: defaultdict mapping word -> occurrence count
             (keys are presumably bytes, since the file is read raw —
             TODO confirm against the hdfs client's default encoding).
    """
    hdfs = Client('http://192.168.175.139:9870')  # worker-local HDFS connection
    path = '/test/input1/' + fn
    word_counts = defaultdict(int)
    with hdfs.read(path) as f:
        for line in f.readlines():
            # split() with no args splits on any whitespace run
            for word in line.split():
                word_counts[word] += 1
    return word_counts
# NOTE(review): a local (non-distributed) smoke test of count_words was
# left commented out here by the author.
future = client.submit(count_words,filenames[0])  # run one task remotely
counts = future.result()  # block until that single task finishes
print(counts)
futures = client.map(count_words,filenames)  # one task per input file
print(len(futures))
print(futures[:])  # prints the Future objects themselves, not their results
print(progress(futures)) # show runtime / percentage of the distributed run
print(futures[0].result())  # fetch the first file's word counts
def top_items(d, n=None):
    """Return the word-count mapping sorted by count, descending.

    Relies on dict preserving insertion order (Python 3.7+), so the
    returned dict iterates from the most frequent word to the least.

    :param d: mapping of word -> count.
    :param n: optional; if given, keep only the n most frequent words.
    :return: dict ordered by descending count.
    """
    # The original applied a redundant [::] full-copy slice after
    # sorted(); sorted() already returns a fresh list.
    items = sorted(d.items(), key=lambda kv: kv[1], reverse=True)
    if n is not None:
        items = items[:n]
    return dict(items)
futures2 = client.map(top_items,futures)  # sort each file's counts on the workers
# Gather the list of futures directly. The original passed iter(futures2),
# which makes gather() return an iterator, so print(results) showed the
# iterator object instead of the data; it also printed iter(futures2)
# itself, a meaningless iterator repr — both fixed here.
results = client.gather(futures2)  # pull the sorted per-file counts to the driver
print(results)
all_counts = Counter()  # merge per-file counts into one global tally
for result in results:
    print(result)
    all_counts.update(result)
print(len(all_counts))  # number of distinct words across all files
# Global ranking, most frequent first (redundant [::] copy-slice dropped).
print(sorted(all_counts.items(), key=lambda kv: kv[1], reverse=True))
print('end')
由于是初学,代码中出现了各种各样的问题;耐心点,一步步来,一行行地去调试。