使用 Pig 对钓鱼网站链接(URL)做词频统计(大数据处理与分析技术)
数据展示:(verified_online.csv)
grunt> -- Word-frequency count over the URL column of verified_online.csv.
grunt> -- Load the comma-separated file; all eight columns are read as chararray.
grunt> A = LOAD 'verified_online.csv' USING PigStorage(',') AS (
>>         phish_id:chararray,
>>         url:chararray,
>>         phish_detail_url:chararray,
>>         submission_time:chararray,
>>         verified:chararray,
>>         verification_time:chararray,
>>         online:chararray,
>>         target:chararray
>>     );
grunt> -- Keep only the rows whose target column equals the chosen literal.
grunt> B = FILTER A BY (target == 'targetWord');
grunt> -- Emit: target with double quotes removed, the phish id, and the URL with
grunt> -- each separator (':', '//', '.', '/', '?', '=') rewritten to a comma.
grunt> -- The REPLACE patterns are regular expressions, hence the escaped forms.
grunt> -- Replacement order (innermost first): ':'  '//'  ',,'  '.'  '/'  '?'  '='.
grunt> C = FOREACH B GENERATE
>>         REPLACE(target, '"', ''),
>>         phish_id,
>>         REPLACE(
>>           REPLACE(
>>             REPLACE(
>>               REPLACE(
>>                 REPLACE(
>>                   REPLACE(
>>                     REPLACE(url, ':', ','),
>>                     '//', ','),
>>                   ',,', ','),
>>                 '\\.', ','),
>>               '\\/', ','),
>>             '\\?', ','),
>>           '\\=', ',') AS (line);
grunt> -- Split each rewritten URL into tokens and emit one row per token.
grunt> D = FOREACH C GENERATE FLATTEN(TOKENIZE(line)) AS word;
grunt> -- Collect identical tokens together.
grunt> E = GROUP D BY word;
grunt> -- For every distinct token, count how many times it occurred.
grunt> F = FOREACH E GENERATE group, COUNT(D) AS cnt;
grunt> -- Sort by count, largest first, and print the result to the console.
grunt> G = ORDER F BY cnt DESC;
grunt> DUMP G;