1. 示例需求
在微博、论坛中转发帖子时,实际上是对URL进行转发,并分享给自己的粉丝(关注我的人)。由于每一个转发者的粉丝(关注者)可能会有重复的情况,需求就是统计帖子(URL)经转发后到达的人数(即去重后的粉丝总数,Reach)。
2. 解决方案
实现步骤如下:
第一:获取当前转发帖子的人。 GetTweetersBolt
第二:获取当前人的粉丝(关注者)。GetFollowersBolt
第三:进行粉丝去重。PartialUniquerBatchBolt
第四:统计人数。CountAggregatorBatchBlot
第五:最后使用DRPC远程调用 Topology返回执行结果 TiwtterReachTopology
3. 代码示例
TiwtterReachTopology
package com.john.learn.storm.drpc.reach;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.LocalDRPC;
import org.apache.storm.StormSubmitter;
import org.apache.storm.drpc.LinearDRPCTopologyBuilder;
import org.apache.storm.generated.AlreadyAliveException;
import org.apache.storm.generated.AuthorizationException;
import org.apache.storm.generated.InvalidTopologyException;
import org.apache.storm.logging.ThriftAccessLogger;
import org.apache.storm.tuple.Fields;
import com.john.learn.storm.drpc.reach.bolt.CountAggregatorBatchBlot;
import com.john.learn.storm.drpc.reach.bolt.GetFollowersBolt;
import com.john.learn.storm.drpc.reach.bolt.GetTweetersBolt;
import com.john.learn.storm.drpc.reach.bolt.PartialUniquerBatchBolt;
import clojure.main;
/**
 * Wires up a linear DRPC topology that computes the "reach" of a URL: the
 * number of distinct followers of everyone who re-posted that URL.
 *
 * <p>The pipeline is GetTweetersBolt -> GetFollowersBolt ->
 * PartialUniquerBatchBolt -> CountAggregatorBatchBlot. The lookup tables used
 * by the bolts are simulated with the static in-memory maps below.
 */
public class TiwtterReachTopology {

    /** Simulated URL table: URL id -> URL. */
    public static Map<String, String> URLS_DB = new HashMap<>();

    /** Simulated "who re-posted the URL" table: URL id -> names of the re-posters. */
    public static Map<String, List<String>> TWEETERS_DB = new HashMap<String, List<String>>();

    /** Simulated follower table: re-poster name -> names of that person's followers. */
    public static Map<String, List<String>> FOLLOWERS_DB = new HashMap<String, List<String>>();

    // Sample data. Real data would live in a database / MongoDB / Redis; this
    // is only a simulation for learning purposes.
    static {
        URLS_DB.put("1", "www.tiwtter.com");
        URLS_DB.put("2", "engineering.twitter.com/blog/5");
        URLS_DB.put("3", "tech.backtype.com/blog/123");

        TWEETERS_DB.put("1", Arrays.asList("sally", "bob", "tim", "george", "nathan"));
        TWEETERS_DB.put("2", Arrays.asList("adam", "david", "sally", "nathan"));
        TWEETERS_DB.put("3", Arrays.asList("tim", "mike", "john"));

        FOLLOWERS_DB.put("george", Arrays.asList("alice", "adam", "jim", "chris", "jai"));
        FOLLOWERS_DB.put("sally", Arrays.asList("bob", "tim", "alice", "adam", "jim", "chris", "jai"));
        FOLLOWERS_DB.put("bob", Arrays.asList("tim", "nathan", "jim", "mary", "david", "vivian"));
        FOLLOWERS_DB.put("tim", Arrays.asList("alex"));
        FOLLOWERS_DB.put("nathan",
                Arrays.asList("sally", "bob", "adam", "harry", "chris", "vivian", "emily", "jordan"));
        FOLLOWERS_DB.put("adam", Arrays.asList("david", "carissa"));
        FOLLOWERS_DB.put("mike", Arrays.asList("john", "bob"));
        FOLLOWERS_DB.put("john", Arrays.asList("alice", "nathan", "jim", "mike", "bob"));
    }

    /** Builder holding the four-stage DRPC pipeline; assigned once in the constructor. */
    private final LinearDRPCTopologyBuilder linearDRPCTopologyBuilder;

    /** Topology configuration shared by the local and the remote submission. */
    private final Config config;

    /**
     * Creates the topology definition for the given DRPC function.
     *
     * @param functionName name under which the DRPC function is registered
     */
    public TiwtterReachTopology(String functionName) {
        config = new Config();
        linearDRPCTopologyBuilder = new LinearDRPCTopologyBuilder(functionName);

        // Stage 1: URL -> (Id, Tweetter).
        linearDRPCTopologyBuilder.addBolt(new GetTweetersBolt(), 3);
        // Stage 2: (Id, Tweetter) -> (Id, Follower).
        linearDRPCTopologyBuilder.addBolt(new GetFollowersBolt(), 12).shuffleGrouping();
        // Stage 3: partial de-duplication, grouped on both Id and Follower.
        linearDRPCTopologyBuilder.addBolt(new PartialUniquerBatchBolt(), 6)
                .fieldsGrouping(new Fields("Id", "Follower"));
        // Stage 4: sums the partial counts, grouped on Id.
        linearDRPCTopologyBuilder.addBolt(new CountAggregatorBatchBlot(), 3)
                .fieldsGrouping(new Fields("Id"));
    }

    /**
     * Submits the topology to a newly created in-process {@link LocalCluster}.
     *
     * <p>The cluster is not shut down by this method; {@link #main(String[])}
     * ends the JVM with {@code System.exit} after the demo call.
     *
     * @return the local DRPC handle through which the function can be executed
     */
    public LocalDRPC submitLocal() {
        LocalDRPC drpc = new LocalDRPC();
        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology("TiwtterReachTopology", config, linearDRPCTopologyBuilder.createLocalTopology(drpc));
        return drpc;
    }

    /**
     * Submits the topology through {@link StormSubmitter} with three workers.
     *
     * @throws RuntimeException wrapping any exception raised by the submission
     */
    public void submitRemote() {
        config.setNumWorkers(3);
        // Server mode
        try {
            StormSubmitter.submitTopology("TiwtterReachTopology", config,
                    linearDRPCTopologyBuilder.createRemoteTopology());
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Entry point: without arguments runs one demo query locally and exits,
     * otherwise submits the topology remotely.
     *
     * @param args any argument switches to remote submission
     */
    public static void main(String[] args) {
        TiwtterReachTopology tiwtterReachTopology = new TiwtterReachTopology("TiwtterReach");
        if (args == null || args.length == 0) {
            LocalDRPC localDRPC = tiwtterReachTopology.submitLocal();
            System.out.println("Please input a free flag to start reach function");
            // NOTE(review): "foo.com/blog/1" is not one of the URLS_DB values, so
            // GetTweetersBolt finds no id for it; query a URL from URLS_DB to see
            // a count based on the sample data.
            System.out.println("Reach Count :" + localDRPC.execute("TiwtterReach", "foo.com/blog/1"));
            // The local run finished normally, so report success (was exit status 1).
            System.exit(0);
        }
        tiwtterReachTopology.submitRemote();
    }
}
注意: 如下Code 中 分组非常重要。
// The whole pipeline is registered under a single DRPC function name.
linearDRPCTopologyBuilder = new LinearDRPCTopologyBuilder(functionName);

// Stage 1: look up who re-posted the URL (parallelism 3).
linearDRPCTopologyBuilder.addBolt(new GetTweetersBolt(), 3);

// Stage 2: expand every re-poster into followers (parallelism 12, shuffle grouping).
linearDRPCTopologyBuilder.addBolt(new GetFollowersBolt(), 12).shuffleGrouping();

// Stage 3: de-duplicate followers; grouped on BOTH "Id" and "Follower".
linearDRPCTopologyBuilder.addBolt(new PartialUniquerBatchBolt(), 6)
        .fieldsGrouping(new Fields("Id", "Follower"));

// Stage 4: add up the partial counts; grouped on "Id".
linearDRPCTopologyBuilder.addBolt(new CountAggregatorBatchBlot(), 3)
        .fieldsGrouping(new Fields("Id"));
问题: 对于红色部分,如果将fieldsGrouping(new Fields("Id", "Follower")) 修改成 fieldsGrouping(new Fields("Id")) 可以么?
不可以。这样会导致同一个Id的所有Follower都在同一个Bolt Executor中计算,由于Followers Set需要保存所有Follower来完成去重,存在OOM风险,同时也没有充分利用分布式计算能力。如果系统注册人员海量,Followers也会海量,必须按("Id", "Follower")分组,并增加PartialUniquerBatchBolt的并行度来提高计算速度、降低OOM风险。
Bolt
GetTweetersBolt
package com.john.learn.storm.drpc.reach.bolt;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import com.john.learn.storm.drpc.reach.TiwtterReachTopology;
/**
 * First stage of the reach pipeline: resolves the requested URL to its id and
 * emits one (Id, Tweetter) tuple per person recorded in
 * {@code TiwtterReachTopology.TWEETERS_DB} for that id.
 */
public class GetTweetersBolt extends BaseRichBolt {

    private static final long serialVersionUID = 1L;

    /** Collector handed over in {@link #prepare}; used for emitting and acking. */
    private OutputCollector collector;

    @Override
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
    }

    /**
     * Emits the tweeters for the incoming request and always acks the input.
     *
     * @param input tuple carrying the request id at index 0 and the URL at index 1
     */
    @Override
    public void execute(Tuple input) {
        try {
            emitTweeters(input);
        } finally {
            // With BaseRichBolt the ack/fail has to be issued by hand (BaseBasicBolt
            // acks automatically). Acking in finally covers the paths that emit
            // nothing, so the downstream bolt is not left waiting.
            collector.ack(input);
        }
    }

    /** Looks up the tweeters of the requested URL and emits them anchored to {@code input}. */
    private void emitTweeters(Tuple input) {
        Object requestId = input.getValue(0);
        String url = input.getString(1);
        System.out.println("GetTweetersBolt URL:" + url);

        String urlId = findUrlId(url);
        System.out.println("GetTweetersBolt urlId:" + urlId);
        if (urlId != null) {
            List<String> tweeters = TiwtterReachTopology.TWEETERS_DB.get(urlId);
            System.out.println("GetTweetersBolt tweetters:" + tweeters);
            if (tweeters != null) {
                for (String tweeter : tweeters) {
                    // Emit anchored to the input tuple so failures can be traced back.
                    collector.emit(input, new Values(requestId, tweeter));
                }
            }
        }
    }

    /**
     * Finds the id whose URL equals {@code url}, ignoring case.
     *
     * @return the matching id, or {@code null} when no URL matches
     */
    private String findUrlId(String url) {
        String match = null;
        for (Map.Entry<String, String> candidate : TiwtterReachTopology.URLS_DB.entrySet()) {
            if (candidate.getValue().equalsIgnoreCase(url)) {
                match = candidate.getKey();
                break;
            }
        }
        return match;
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("Id", "Tweetter"));
    }
}
注意:这里采用BaseRichBolt,必须手动调用ack或fail方法。否则当URL不存在时直接return(没有emit任何数据),会导致Topology流程堵塞,无法启动下一个Bolt(GetFollowersBolt)的任务处理,最终导致DRPC超时,因此务必小心使用BaseRichBolt。
但如果我们使用BaseBasicBolt,就不需要手动ack了,因为execute方法执行完后会自动ack,这就是它与BaseRichBolt最大的区别。
GetFollowersBolt
package com.john.learn.storm.drpc.reach.bolt;
import java.util.List;
import java.util.Map;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import com.john.learn.storm.drpc.reach.TiwtterReachTopology;
/**
 * Second stage of the reach pipeline: for every (Id, Tweetter) tuple emits one
 * (Id, Follower) tuple per follower listed for that tweeter in
 * {@code TiwtterReachTopology.FOLLOWERS_DB}.
 */
public class GetFollowersBolt extends BaseRichBolt {

    private static final long serialVersionUID = 1L;

    /** Collector handed over in {@link #prepare}; used for emitting and acking. */
    private OutputCollector collector;

    @Override
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
    }

    /**
     * Emits the followers of the incoming tweeter and always acks the input.
     *
     * @param input tuple carrying the request id at index 0 and the tweeter name at index 1
     */
    @Override
    public void execute(Tuple input) {
        try {
            // The id is only forwarded, so read it as an opaque value, the same way
            // GetTweetersBolt and the batch bolts handle it, rather than requiring a Long.
            Object requestId = input.getValue(0);
            String tweeter = input.getString(1);

            List<String> followers = TiwtterReachTopology.FOLLOWERS_DB.get(tweeter);
            if (followers == null || followers.isEmpty()) {
                return;
            }
            for (String follower : followers) {
                // Emit anchored to the input tuple so failures can be traced back.
                collector.emit(input, new Values(requestId, follower));
            }
        } finally {
            // With BaseRichBolt the ack/fail has to be issued by hand (BaseBasicBolt
            // acks automatically). Acking in finally also covers the early return,
            // so the following batch bolt can learn that this input is done.
            collector.ack(input);
        }
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("Id", "Follower"));
    }
}
注意:这里采用BaseRichBolt,必须手动调用ack或fail方法,否则PartialUniquerBatchBolt无法知道该Id对应的数据已经处理完毕,导致finishBatch不会被调用。
PartialUniquerBatchBolt
package com.john.learn.storm.drpc.reach.bolt;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.apache.storm.coordination.BatchOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.BasicOutputCollector;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseBasicBolt;
import org.apache.storm.topology.base.BaseBatchBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
/**
 * Third stage of the reach pipeline: collects the distinct follower names seen
 * by this bolt instance and, when the batch finishes, emits how many there were
 * as an (Id, FollowerCount) tuple.
 */
public class PartialUniquerBatchBolt extends BaseBatchBolt<Object> {

    private static final long serialVersionUID = 1L;

    /** Followers already counted by this instance; the set keeps them unique. */
    private Set<String> seenFollowers = new HashSet<>();

    /** Batch id received in {@link #prepare}; echoed back in the emitted tuple. */
    private Object batchId;

    private BatchOutputCollector out;

    @Override
    public void prepare(Map conf, TopologyContext context, BatchOutputCollector collector, Object id) {
        this.out = collector;
        this.batchId = id;
    }

    /**
     * Records the follower carried at index 1 of the tuple.
     *
     * @param tuple an (Id, Follower) tuple
     */
    @Override
    public void execute(Tuple tuple) {
        // Set.add is a no-op for a name that is already present, so repeated
        // followers are not counted twice.
        seenFollowers.add(tuple.getString(1));
    }

    /** Emits the number of distinct followers recorded by this instance. */
    @Override
    public void finishBatch() {
        int distinctFollowers = seenFollowers.size();
        out.emit(new Values(batchId, distinctFollowers));
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("Id", "FollowerCount"));
    }
}
CountAggregatorBatchBlot
package com.john.learn.storm.drpc.reach.bolt;
import java.util.Map;
import org.apache.storm.coordination.BatchOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseBatchBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
/**
 * Final stage of the reach pipeline: adds up the integer counts carried at
 * index 1 of the incoming tuples and emits the sum as an (Id, ReachCount)
 * tuple when the batch finishes.
 */
public class CountAggregatorBatchBlot extends BaseBatchBolt<Object> {

    private static final long serialVersionUID = 1L;

    /** Batch id received in {@link #prepare}; echoed back in the emitted tuple. */
    private Object batchId;

    private BatchOutputCollector out;

    /** Running sum of the partial counts received so far. */
    private int total;

    @Override
    public void prepare(Map conf, TopologyContext context, BatchOutputCollector collector, Object id) {
        this.out = collector;
        this.batchId = id;
    }

    /**
     * Adds one partial count to the running sum.
     *
     * @param tuple tuple whose value at index 1 is an integer count
     */
    @Override
    public void execute(Tuple tuple) {
        Integer partialCount = tuple.getInteger(1);
        total = total + partialCount;
    }

    /** Emits the accumulated sum together with the batch id. */
    @Override
    public void finishBatch() {
        out.emit(new Values(batchId, total));
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("Id", "ReachCount"));
    }
}
运行结果:
/**
 * Entry point: without arguments runs the reach query locally for every URL
 * in URLS_DB and exits, otherwise submits the topology remotely.
 *
 * @param args any argument switches to remote submission
 */
public static void main(String[] args) {
    TiwtterReachTopology tiwtterReachTopology = new TiwtterReachTopology("TiwtterReach");
    if (args == null || args.length == 0) {
        LocalDRPC localDRPC = tiwtterReachTopology.submitLocal();
        System.out.println("Please input a free flag to start reach function");
        // Only the URL is needed, so iterate the values instead of looking each key up again.
        for (String url : URLS_DB.values()) {
            // A space now separates the URL from the label (previously they ran together).
            System.out.println("The URL " + url + " Reach Count :"
                    + localDRPC.execute("TiwtterReach", url));
        }
        // The local run finished normally, so report success (was exit status 1).
        System.exit(0);
        return;
    }
    tiwtterReachTopology.submitRemote();
}