[size=large][color=red][b]WordCount实现 [/b][/color][/size]
-- WordCount: count how many times each word occurs in the input text.
-- (1) Load the data; each input line is read as a single chararray field.
a = load '/input/data' as (line:chararray);
-- (2) Split each line into words; flatten unnests the bag of tokens into one row per word.
b = foreach a generate flatten(TOKENIZE(line)) as word;
-- (3) Group the words. The relation to group is b (no alias named "words" is defined).
c = group b by word;
-- (4) Count the words in each group; the grouped bag is named after the grouped alias, b.
d = foreach c generate group, COUNT(b);
-- (5) Print the result. The counts are held in d (no alias named "cntd" is defined).
dump d;
[b]foreach c generate group :[/b]取出c中每个分组的key(即group字段)
c = (a,{})(b,{})(c,{}):foreach c generate group =>得到(a)(b)(c)
--------------------------------------------------------------
line = (lin lin lin)
[b]TOKENIZE:[/b]foreach a generate TOKENIZE(line,' ') =>({(lin),(lin),(lin)})
[b]flatten:[/b]会把包(bag)中的内容展开,使其中每个元素各自成为一条元组; ({(lin),(lin),(lin)}) =>(lin)(lin)(lin)
-- WordCount: count how many times each word occurs in the input text.
-- (1) Load the data; each input line is read as a single chararray field.
a = load '/input/data' as (line:chararray);
-- (2) Split each line into words; flatten unnests the bag of tokens into one row per word.
b = foreach a generate flatten(TOKENIZE(line)) as word;
-- (3) Group the words. The relation to group is b (no alias named "words" is defined).
c = group b by word;
-- (4) Count the words in each group; the grouped bag is named after the grouped alias, b.
d = foreach c generate group, COUNT(b);
-- (5) Print the result. The counts are held in d (no alias named "cntd" is defined).
dump d;
[b]foreach c generate group :[/b]取出c中每个分组的key(即group字段)
c = (a,{})(b,{})(c,{}):foreach c generate group =>得到(a)(b)(c)
--------------------------------------------------------------
line = (lin lin lin)
[b]TOKENIZE:[/b]foreach a generate TOKENIZE(line,' ') =>({(lin),(lin),(lin)})
[b]flatten:[/b]会把包(bag)中的内容展开,使其中每个元素各自成为一条元组; ({(lin),(lin),(lin)}) =>(lin)(lin)(lin)