需求
用Trident实现汇总型PV统计;
1:按天
2:累计
FixedBatchSpout发送若干个tuple,每个tuple是一个字符串,字符串格式是 hosts + "\t" + sessionId + "\t" + times,其中times是日期 + 时间。每个batch最多包含3个tuple,不设置为持续发送。DRPC 函数名称是 getPV,execute 的第二个参数是日期,可以一次查询多天,但日期之间要用空格隔开;
(1)MySplit类,继承 BaseFunction;传入的pattern作为分割符号;传入的字符串,按照pattern分割,发射2个字段,date,sessionId。其中date只含有日期。
package trident.function;
import Util.DateFmt;
import backtype.storm.tuple.Values;
import storm.trident.operation.BaseFunction;
import storm.trident.operation.TridentCollector;
import storm.trident.tuple.TridentTuple;
/**
 * Trident function that parses one raw log line into {@code (date, sessionId)}.
 *
 * <p>Each input tuple's first field is a line of the form
 * {@code host <sep> sessionId <sep> "yyyy-MM-dd HH:mm:ss"}, where {@code <sep>}
 * is the pattern given at construction time (a tab in this topology). The
 * timestamp is truncated to its date part via {@code DateFmt.getCountDate},
 * so downstream grouping by {@code date} yields per-day counts.
 */
public class MySplit extends BaseFunction {

    private static final long serialVersionUID = 1L;

    /** Separator passed to {@link String#split(String)} — interpreted as a regex. */
    private final String pattern;

    /**
     * @param pattern separator used to split each log line (e.g. {@code "\t"})
     */
    public MySplit(String pattern) {
        this.pattern = pattern;
    }

    /**
     * Splits the incoming log line and emits two fields: {@code date}
     * (date part only) and {@code sessionId}. Lines that do not split into
     * exactly three parts are silently dropped.
     */
    @Override
    public void execute(TridentTuple tuple, TridentCollector collector) {
        String log = tuple.getString(0);
        String[] parts = log.split(pattern);
        if (parts.length == 3) {
            // parts[2] is "yyyy-MM-dd HH:mm:ss"; keep only the day so the
            // topology can group-by calendar date. parts[1] is the session id.
            collector.emit(new Values(DateFmt.getCountDate(parts[2], DateFmt.DATE_SHORT), parts[1]));
        }
    }
}
(2)Split类,继承 BaseFunction;传入的pattern作为分割符号;传入的字符串,按照pattern分割,分割后的字符串逐个发送;
package trident.function;
import backtype.storm.tuple.Values;
import storm.trident.operation.BaseFunction;
import storm.trident.operation.TridentCollector;
import storm.trident.tuple.TridentTuple;
/**
 * Trident function that tokenizes its input string on a separator pattern,
 * emitting one single-field tuple per token. Used by the DRPC stream to turn
 * a space-separated list of dates into individual {@code date} tuples.
 */
public class Split extends BaseFunction {
    private static final long serialVersionUID = 1L;
    // Separator supplied at construction time; treated as a regex by split().
    String pattern = null;

    public Split(String pattern) {
        this.pattern = pattern;
    }

    /** Split the tuple's first field on {@code pattern}; emit each token separately. */
    @Override
    public void execute(TridentTuple tuple, TridentCollector collector) {
        String input = tuple.getString(0);
        String[] tokens = input.split(pattern);
        for (int i = 0; i < tokens.length; i++) {
            collector.emit(new Values(tokens[i]));
        }
    }
}
(3)FixedBatchSpout作为数据源;TridentState存储spout处理完后的结果;用drpc发起查询;查询 "2019-09-15 2019-09-16" 这两天各自有多少PV;
package trident;
import java.util.Random;
import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.LocalDRPC;
import backtype.storm.StormSubmitter;
import backtype.storm.generated.StormTopology;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Values;
import storm.trident.TridentState;
import storm.trident.TridentTopology;
import storm.trident.operation.builtin.Count;
import storm.trident.operation.builtin.FilterNull;
import storm.trident.operation.builtin.MapGet;
import storm.trident.testing.FixedBatchSpout;
import storm.trident.testing.MemoryMapState;
import trident.function.MySplit;
import trident.function.Split;
/**
 * Trident topology computing per-day page-view (PV) counts.
 *
 * <p>A {@link FixedBatchSpout} replays a fixed set of demo log lines
 * ({@code host \t sessionId \t timestamp}). The stream splits each line with
 * {@link MySplit}, groups by date, and persists counts into an in-memory
 * {@link MemoryMapState} exposed as a {@link TridentState}. A DRPC stream
 * named {@code getPV} queries that state: its argument is one or more dates
 * separated by spaces, e.g. {@code "2019-09-15 2019-09-16"}.
 */
public class TridentPVTopo {

    /** Number of demo log lines fed into the spout. */
    private static final int LOG_COUNT = 8;

    /**
     * Builds the PV topology.
     *
     * @param drpc local DRPC handle for local-mode testing; pass {@code null}
     *             when submitting to a real cluster (DRPC is then served by
     *             the cluster's DRPC daemons)
     * @return the assembled topology
     */
    public static StormTopology buildTopology(LocalDRPC drpc) {
        Random random = new Random();
        // All demo hits go to the same host.
        String hosts = "www.taobao.com";
        // Pool of fake session ids to sample from.
        String[] sessionId = { "5GFBAT3D3100A7A7255027A70", "5X16BCA8823AC4BD9CD196A5D", "5CFBA5BD76BACF436ACA9DCC8",
                "5D16C3E0209C16DEAA28L1824", "5I16CB309251CCF6CE6223BA1", "5C16BC4MB91B85661FE22F413",
                "5D16C1F5191CF9371Y32B58CF", "5D16C7A886E2P2AE3EA29FC3E", "5C3FBA728FD7D264B80769B23",
                "5B16C0F7215109AG43528BA2D", "5N16C2FE51E5619C2A1244215", "5D16C1EB1C7A751AE03201C3F" };
        // Pool of fake login timestamps spanning two days.
        String[] times = { "2019-09-15 08:01:36", "2019-09-15 08:11:37", "2019-09-15 08:31:38", "2019-09-15 09:23:07",
                "2019-09-15 10:51:27", "2019-09-15 10:51:56", "2019-09-15 11:01:07", "2019-09-15 11:01:20",
                "2019-09-16 11:45:30", "2019-09-16 12:31:49", "2019-09-16 12:41:51", "2019-09-16 12:51:37",
                "2019-09-16 13:11:27", "2019-09-16 13:20:40", "2019-09-16 13:31:38" };

        // Build LOG_COUNT random log lines. Indexing by array length (instead of
        // hard-coded 12/15) keeps this correct if the pools above change size.
        Values[] logs = new Values[LOG_COUNT];
        for (int i = 0; i < LOG_COUNT; i++) {
            logs[i] = new Values(hosts + "\t" + sessionId[random.nextInt(sessionId.length)]
                    + "\t" + times[random.nextInt(times.length)]);
        }
        // Each batch carries at most 3 tuples.
        FixedBatchSpout spout = new FixedBatchSpout(new Fields("eachLog"), 3, logs);
        // false: emit the fixed data once, do not replay it forever.
        spout.setCycle(false);

        TridentTopology topology = new TridentTopology();
        // Split each log line into (date, sessionId), group by day, and persist
        // per-day counts into an in-memory map state (Trident's core abstraction;
        // a real deployment would use an external store instead).
        TridentState pvCounts = topology.newStream("spout1", spout)
                // .parallelismHint(16) // parallelism left at 1 for easier observation
                .each(new Fields("eachLog"), new MySplit("\t"), new Fields("date", "sessionId"))
                .groupBy(new Fields("date"))
                .persistentAggregate(new MemoryMapState.Factory(), new Fields("sessionId"),
                        new Count(), new Fields("PV"));

        // DRPC query side: split the space-separated date argument, group by date
        // (which also deduplicates repeated dates), look each one up in the state,
        // and drop dates with no recorded PV.
        topology.newDRPCStream("getPV", drpc)
                .each(new Fields("args"), new Split(" "), new Fields("date"))
                .groupBy(new Fields("date"))
                .stateQuery(pvCounts, new Fields("date"), new MapGet(), new Fields("PV"))
                .each(new Fields("PV"), new FilterNull());
        return topology.build();
    }

    /**
     * With no CLI arguments, runs in local mode and issues 100 DRPC queries for
     * the two demo days, one per second; otherwise submits to the cluster under
     * the name given as {@code args[0]}.
     */
    public static void main(String[] args) throws Exception {
        Config conf = new Config();
        conf.setMaxSpoutPending(20);
        if (args.length == 0) {
            LocalDRPC drpc = new LocalDRPC();
            LocalCluster cluster = new LocalCluster();
            cluster.submitTopology("PVCounter", conf, buildTopology(drpc));
            for (int i = 0; i < 100; i++) {
                // First argument is the DRPC function name; second is its input —
                // one or more dates separated by spaces.
                System.err.println("DRPC RESULT: " + drpc.execute("getPV", "2019-09-15 2019-09-16"));
                Thread.sleep(1000);
            }
            // Release local-mode resources so the JVM can exit cleanly.
            drpc.shutdown();
            cluster.shutdown();
        } else {
            conf.setNumWorkers(3);
            StormSubmitter.submitTopology(args[0], conf, buildTopology(null));
        }
    }
}
打印出的日志,2019-09-15有2个PV,2019-09-16有6个PV;
DRPC RESULT: [["2019-09-15 2019-09-16","2019-09-15",2],["2019-09-15 2019-09-16","2019-09-16",6]]