Storm 从入门到精通 第二十一讲 Storm DRPC 之 官方复杂示例 TiwtterReachTopology

1. 示例需求

在微博、论坛转发帖子时,实际转发的是帖子的URL,并分享给自己的粉丝(关注我的人)。不同转发者的粉丝(关注者)可能会有重复,需求就是统计一个帖子(URL)经过转发后能够到达的人数(Reach),即所有转发者的粉丝去重后的总人数。

2. 解决方案

实现步骤如下:
   第一:获取当前转发帖子的人。 GetTweetersBolt
   第二:获取当前人的粉丝(关注者)。GetFollowersBolt
   第三:进行粉丝去重。PartialUniquerBatchBolt
   第四:统计人数。CountAggregatorBatchBlot
   第五:最后使用DRPC远程调用 Topology返回执行结果 TiwtterReachTopology

3. 代码示例

TiwtterReachTopology

package com.john.learn.storm.drpc.reach;

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.LocalDRPC;
import org.apache.storm.StormSubmitter;
import org.apache.storm.drpc.LinearDRPCTopologyBuilder;
import org.apache.storm.generated.AlreadyAliveException;
import org.apache.storm.generated.AuthorizationException;
import org.apache.storm.generated.InvalidTopologyException;
import org.apache.storm.logging.ThriftAccessLogger;
import org.apache.storm.tuple.Fields;

import com.john.learn.storm.drpc.reach.bolt.CountAggregatorBatchBlot;
import com.john.learn.storm.drpc.reach.bolt.GetFollowersBolt;
import com.john.learn.storm.drpc.reach.bolt.GetTweetersBolt;
import com.john.learn.storm.drpc.reach.bolt.PartialUniquerBatchBolt;

import clojure.main;

public class TiwtterReachTopology {

  public static Map<String, String> URLS_DB = new HashMap<>();

  // 模拟Who转发了URL
  public static Map<String, List<String>> TWEETERS_DB = new HashMap<String, List<String>>();

  // 模拟转发人员的粉丝人员
  public static Map<String, List<String>> FOLLOWERS_DB = new HashMap<String, List<String>>();

  /**
   * 初始化数据, 实际数据应该保存在Database / Mongon / Redis, 这里仅仅模拟学习
   */
  static {

     URLS_DB.put("1", "www.tiwtter.com");
     URLS_DB.put("2", "engineering.twitter.com/blog/5");
     URLS_DB.put("3", "tech.backtype.com/blog/123");

     TWEETERS_DB.put("1", Arrays.asList("sally", "bob", "tim", "george", "nathan"));
     TWEETERS_DB.put("2", Arrays.asList("adam", "david", "sally", "nathan"));
     TWEETERS_DB.put("3", Arrays.asList("tim", "mike", "john"));

     FOLLOWERS_DB.put("george", Arrays.asList("alice","adam", "jim", "chris", "jai"));
     FOLLOWERS_DB.put("sally", Arrays.asList("bob", "tim", "alice", "adam", "jim", "chris", "jai"));
     FOLLOWERS_DB.put("bob", Arrays.asList("tim", "nathan", "jim", "mary", "david", "vivian"));
     FOLLOWERS_DB.put("tim", Arrays.asList("alex"));
     FOLLOWERS_DB.put("nathan",
          Arrays.asList("sally", "bob", "adam", "harry", "chris", "vivian", "emily", "jordan"));
     FOLLOWERS_DB.put("adam", Arrays.asList("david", "carissa"));
     FOLLOWERS_DB.put("mike", Arrays.asList("john", "bob"));
     FOLLOWERS_DB.put("john", Arrays.asList("alice", "nathan", "jim", "mike", "bob"));

  }

  public TiwtterReachTopology(String functionName) {

     config = new Config();

     linearDRPCTopologyBuilder = new LinearDRPCTopologyBuilder(functionName);
     linearDRPCTopologyBuilder.addBolt(new GetTweetersBolt(), 3);
     linearDRPCTopologyBuilder.addBolt(new GetFollowersBolt(), 12).shuffleGrouping();
     linearDRPCTopologyBuilder.addBolt(new PartialUniquerBatchBolt(), 6)
          .fieldsGrouping(new Fields("Id", "Follower"));
     linearDRPCTopologyBuilder.addBolt(new CountAggregatorBatchBlot(), 3).fieldsGrouping(new Fields("Id"));
  }

  public LocalDRPC submitLocal() {

     LocalDRPC drpc = new LocalDRPC();

     LocalCluster cluster = new LocalCluster();

     cluster.submitTopology("TiwtterReachTopology", config, linearDRPCTopologyBuilder.createLocalTopology(drpc));

     return drpc;

  }

  public void submitRemote() {

     config.setNumWorkers(3);

     // Server Mode
     try {

       StormSubmitter.submitTopology("TiwtterReachTopology", config,
            linearDRPCTopologyBuilder.createRemoteTopology());

     } catch (Exception e) {

       throw new RuntimeException(e);
     }
  }

  private LinearDRPCTopologyBuilder linearDRPCTopologyBuilder;

  private Config config;

  public static void main(String[] args) {

     TiwtterReachTopology tiwtterReachTopology = new TiwtterReachTopology("TiwtterReach");

     if (args == null || args.length == 0) {

       LocalDRPC localDRPC = tiwtterReachTopology.submitLocal();

       System.out.println("Please input a free flag to start reach function");
       
       System.out.println("Reach Count :" + localDRPC.execute("TiwtterReach", "foo.com/blog/1"));

       System.exit(1);

     }
     
     tiwtterReachTopology.submitRemote();
     

  }

}
 

 注意: 如下Code 中 分组非常重要。

// Pipeline for the DRPC function named by functionName; bolts run in the order added.
linearDRPCTopologyBuilder = new LinearDRPCTopologyBuilder(functionName);

 // Step 1: look up who tweeted the URL.
 linearDRPCTopologyBuilder.addBolt(new GetTweetersBolt(), 3);
 // Step 2: look up each tweeter's followers; shuffle grouping between tasks.
 linearDRPCTopologyBuilder.addBolt(new GetFollowersBolt(), 12).shuffleGrouping();
 // Step 3: de-duplicate followers; grouped on both "Id" and "Follower".
 linearDRPCTopologyBuilder.addBolt(new PartialUniquerBatchBolt(), 6)
          .fieldsGrouping(new Fields("Id", "Follower"));

  // Step 4: sum the partial counts; grouped on "Id" only.
  linearDRPCTopologyBuilder.addBolt(new CountAggregatorBatchBlot(), 3).fieldsGrouping(new Fields("Id"));

 问题: 对于上面 PartialUniquerBatchBolt 的分组,如果将 fieldsGrouping(new Fields("Id", "Follower")) 修改成 fieldsGrouping(new Fields("Id")) 可以么?

不可以。这样会导致同一个Id(同一次请求)的所有Follower都落到同一个Bolt Executor中计算,由于需要用Followers Set保存该请求的全部Follower来完成去重,存在OOM风险,同时也没有充分利用分布式并行计算。如果系统注册人员海量,Followers也会海量,必须按 (Id, Follower) 分组并增加PartialUniquerBatchBolt的并行度,以提高计算速度、降低OOM风险。

Bolt

GetTweetersBolt

package com.john.learn.storm.drpc.reach.bolt;

import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;

import com.john.learn.storm.drpc.reach.TiwtterReachTopology;

/**
 * First bolt of the reach pipeline: resolves the requested URL to its id and
 * emits one {@code (Id, Tweetter)} tuple for every user recorded as having
 * tweeted that URL.
 */
public class GetTweetersBolt extends BaseRichBolt {

  private static final long serialVersionUID = 1L;

  /** Collector received in {@link #prepare}; used for emitting and acking. */
  private OutputCollector collector;

  @Override
  public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
     this.collector = collector;
  }

  /**
   * Handles one request tuple: field 0 is the request id, field 1 the URL.
   * The tuple is acked on every path, including the ones that emit nothing.
   */
  @Override
  public void execute(Tuple input) {

     try {

       emitTweetersOf(input);

     } finally {
       // A BaseRichBolt has to ack/fail by hand (a BaseBasicBolt acks
       // automatically). Without this ack, an early return would leave the
       // next bolt without data.
       collector.ack(input);
     }
  }

  /** Looks up the tweeters of the URL carried by {@code input} and emits them. */
  private void emitTweetersOf(Tuple input) {

     Object requestId = input.getValue(0);
     String url = input.getString(1);

     System.out.println("GetTweetersBolt URL:" + url);

     String urlId = findUrlId(url);

     System.out.println("GetTweetersBolt urlId:" + urlId);

     if (urlId != null) {

       List<String> knownTweeters = TiwtterReachTopology.TWEETERS_DB.get(urlId);

       System.out.println("GetTweetersBolt tweetters:" + knownTweeters);

       if (knownTweeters != null) {

          for (String name : knownTweeters) {

             // Anchor the emitted tuple to the input so failures can be tracked.
             collector.emit(input, new Values(requestId, name));
          }
       }
     }
  }

  /**
   * Returns the id of the first URL entry equal to {@code url}, ignoring
   * case, or {@code null} when no entry matches.
   */
  private String findUrlId(String url) {

     String match = null;

     for (String candidateId : TiwtterReachTopology.URLS_DB.keySet()) {

       if (TiwtterReachTopology.URLS_DB.get(candidateId).equalsIgnoreCase(url)) {

          match = candidateId;
          break;
       }
     }

     return match;
  }

  @Override
  public void declareOutputFields(OutputFieldsDeclarer declarer) {

     declarer.declare(new Fields("Id", "Tweetter"));
  }

}
 

注意:这里采用BaseRichBolt,必须手动调用ack或fail方法。否则当URL不存在时直接return(没有emit任何数据),会导致Topology流程堵塞,无法启动下一个Bolt(GetFollowersBolt)的任务处理,最终导致DRPC超时,务必小心使用BaseRichBolt。

但如果我们使用BaseBasicBolt,就不需要手动ack了,因为execute方法执行完后会自动ack,这就是它与BaseRichBolt最大的区别。


GetFollowersBolt

package com.john.learn.storm.drpc.reach.bolt;

import java.util.List;
import java.util.Map;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;

import com.john.learn.storm.drpc.reach.TiwtterReachTopology;

/**
 * Second bolt of the reach pipeline: for each incoming {@code (Id, Tweetter)}
 * tuple, emits one {@code (Id, Follower)} tuple per follower of that tweeter.
 */
public class GetFollowersBolt extends BaseRichBolt {

  private static final long serialVersionUID = 1L;

  /** Collector received in {@link #prepare}; used for emitting and acking. */
  private OutputCollector collector;

  @Override
  public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {

     this.collector = collector;
  }

  /**
   * Handles one tuple: field 0 is the request id, field 1 the tweeter name.
   * The tuple is acked on every path, including when nothing is emitted.
   */
  @Override
  public void execute(Tuple input) {

     try {

       // The id is only passed through, so read it as an opaque value,
       // like the other bolts in this pipeline do, rather than casting
       // it to Long.
       Object requestId = input.getValue(0);
       String tweeter = input.getString(1);

       List<String> followers = TiwtterReachTopology.FOLLOWERS_DB.get(tweeter);

       if (followers == null || followers.isEmpty()) {

          return;
       }

       for (String follower : followers) {

          // Anchor the emitted tuple to the input so failures can be tracked.
          collector.emit(input, new Values(requestId, follower));
       }

     } finally {

       // A BaseRichBolt has to ack/fail by hand (a BaseBasicBolt acks
       // automatically). Without this ack, an early return would leave the
       // next bolt without data, and its finishBatch could not learn that
       // processing is complete.
       collector.ack(input);
     }

  }

  @Override
  public void declareOutputFields(OutputFieldsDeclarer declarer) {

     declarer.declare(new Fields("Id", "Follower"));
  }

}

注意:这里采用BaseRichBolt,必须手动调用ack或fail方法,否则PartialUniquerBatchBolt无法知道该Id已经处理完毕,导致无法调用finishBatch。

PartialUniquerBatchBolt

package com.john.learn.storm.drpc.reach.bolt;

import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.apache.storm.coordination.BatchOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.BasicOutputCollector;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseBasicBolt;
import org.apache.storm.topology.base.BaseBatchBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;

/**
 * Batch bolt that collects the distinct followers it receives for one
 * request and, when the batch finishes, emits how many it saw as
 * {@code (Id, FollowerCount)}.
 */
public class PartialUniquerBatchBolt extends BaseBatchBolt<Object> {

  private static final long serialVersionUID = 1L;

  /** Id of the request this instance was prepared for. */
  private Object requestId;

  private BatchOutputCollector collector;

  /** Followers already counted for this request; the set keeps them unique. */
  private Set<String> uniqueFollowers = new HashSet<>();

  @Override
  public void prepare(Map conf, TopologyContext context, BatchOutputCollector collector, Object id) {
     this.collector = collector;
     this.requestId = id;
  }

  /** Records the follower in field 1; a follower seen before is ignored. */
  @Override
  public void execute(Tuple tuple) {

     // Set.add leaves the set unchanged when the element is already
     // present, so no separate contains() check is needed.
     uniqueFollowers.add(tuple.getString(1));
  }

  /** Emits the number of distinct followers collected by this instance. */
  @Override
  public void finishBatch() {

     int distinctCount = uniqueFollowers.size();

     collector.emit(new Values(requestId, distinctCount));
  }

  @Override
  public void declareOutputFields(OutputFieldsDeclarer declarer) {

     declarer.declare(new Fields("Id", "FollowerCount"));
  }

}
 

CountAggregatorBatchBlot

package com.john.learn.storm.drpc.reach.bolt;

import java.util.Map;

import org.apache.storm.coordination.BatchOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseBatchBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;

/**
 * Final batch bolt of the reach pipeline: adds up the integer counts it
 * receives for one request and emits the total as {@code (Id, ReachCount)}
 * when the batch finishes.
 */
public class CountAggregatorBatchBlot extends BaseBatchBolt<Object> {

  private static final long serialVersionUID = 1L;

  private BatchOutputCollector collector;

  /** Id of the request this instance was prepared for. */
  private Object requestId;

  /** Running sum of the counts received so far. */
  private int reachCount;

  @Override
  public void prepare(Map conf, TopologyContext context, BatchOutputCollector collector, Object id) {
     this.collector = collector;
     this.requestId = id;
  }

  /** Adds the integer in field 1 of the tuple to the running sum. */
  @Override
  public void execute(Tuple tuple) {

     int partialCount = tuple.getInteger(1);

     reachCount = reachCount + partialCount;
  }

  /** Emits the accumulated sum for this request. */
  @Override
  public void finishBatch() {

     Values result = new Values(requestId, reachCount);

     collector.emit(result);
  }

  @Override
  public void declareOutputFields(OutputFieldsDeclarer declarer) {

     declarer.declare(new Fields("Id", "ReachCount"));
  }

}
 

运行结果:

/**
 * Entry point. With no arguments the topology runs locally and the reach of
 * every URL in URLS_DB is printed; with any argument it is submitted remotely.
 *
 * @param args command-line arguments; empty selects local mode
 */
public static void main(String[] args) {

		TiwtterReachTopology tiwtterReachTopology = new TiwtterReachTopology("TiwtterReach");

		if (args == null || args.length == 0) {

			LocalDRPC localDRPC = tiwtterReachTopology.submitLocal();

			System.out.println("Please input a free flag to start reach function");

			for (String id : URLS_DB.keySet()) {
				// Look the URL up once and reuse it for both the label and the request.
				String url = URLS_DB.get(id);
				System.out.println("The URL " + url + "Reach Count :"
						+ localDRPC.execute("TiwtterReach", url));
			}

			// The local run finished normally, so report success (status 0).
			// A non-zero status would tell callers the demo had failed.
			System.exit(0);

			return;
		}

		tiwtterReachTopology.submitRemote();

	}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值