demo1:对童谣“Mary 和她的羊羔”进行词频统计。
-- Read the nursery-rhyme text from HDFS; the first field of every row is named "line".
lyrics = LOAD '/tmp/dyf/content' AS (line);

-- TOKENIZE splits a line into a bag of words; FLATTEN un-nests that bag
-- so that every word ends up in a row of its own.
tokens = FOREACH lyrics GENERATE FLATTEN(TOKENIZE(line)) AS word;

-- Gather identical words together: one group per distinct word.
by_word = GROUP tokens BY word;

-- For each group, emit the word and how many times it occurred.
word_counts = FOREACH by_word GENERATE group, COUNT(tokens);

-- Print the (word, count) tuples to the console.
DUMP word_counts;
童谣内容
Mary had a little lamb
its fleece was white as snow
and everywhere that Mary went
the lamb was sure to go
打印结果:
(a,1)
(as,1)
(go,1)
(to,1)
(and,1)
(had,1)
(its,1)
(the,1)
(was,2)
(Mary,2)
(lamb,2)
(snow,1)
(sure,1)
(that,1)
(went,1)
(white,1)
(fleece,1)
(little,1)
(everywhere,1)
demo2: 统计用户购买总金额,join 邮编。
-- Load the purchase records as (customer, purchase) rows.
purchases = LOAD '/tmp/dyf/transactions' AS (customer, purchase);

-- Bucket the purchases by customer and add up each customer's spending.
per_customer = GROUP purchases BY customer;
totals = FOREACH per_customer GENERATE
             group,
             SUM(purchases.purchase) AS total_purchase;

-- Load the customer profiles as (customer, zipcode) rows.
profiles = LOAD '/tmp/dyf/customer_profile' AS (customer, zipcode);

-- Attach each customer's zipcode to their total; the grouping key of
-- "totals" (field "group") holds the customer name.
with_zipcode = JOIN totals BY group, profiles BY customer;

-- Print (customer, total_purchase, customer, zipcode) tuples.
DUMP with_zipcode;
transactions 数据:
zwj 36
dyf 120
dzh 1000
dyf 66
zwj 88
dzh 99
customer_profile 数据:
dyf 456373
dzh 458000
zwj 458000
打印结果:
(dyf,186.0,dyf,456373)
(dzh,1099.0,dzh,458000)
(zwj,124.0,zwj,458000)
demo3:查找年龄为18~25岁的用户访问最多的5个网站。
-- Find the five URLs visited most often by users aged 18 to 25.
-- Load the user records as (name, age) rows.
users = load '/tmp/dyf/users' as (name,age);
-- Keep only users whose age is between 18 and 25, inclusive.
fltrd = filter users by age >= 18 and age <= 25;
-- Load the page-visit records as (user, url) rows.
pages = load '/tmp/dyf/pages' as (user,url);
-- Join on the user name so that only visits made by the filtered users remain.
jnd = join fltrd by name,pages by user;
-- Group the surviving visits by URL and count the visits per URL.
grpd = group jnd by url;
smmd = foreach grpd generate group,COUNT(jnd) as clicks;
-- Sort by visit count, highest first, and keep the top five rows.
srtd = order smmd by clicks desc;
top5 = limit srtd 5;
-- Print the (url, clicks) tuples.
dump top5;
users 数据:
dyf 24
zwj 22
dzh 2
pages 数据:
dyf baidu
dyf google
dyf xueqiu
zwj baobao
zwj baidu
zwj google
dzh baobao
dzh taobao
打印结果:
(baidu,2)
(google,2)
(baobao,1)
(xueqiu,1)