实例场景:
我们在微博、论坛转发帖子时,转发的其实是帖子的url。这个例子统计的是:一个帖子(url)被转发之后,一共能被多少个不重复的粉丝看到(即该url的reach)。
主要使用storm的并行计算能力来进行的。
分析:
粉丝指关注我的人;每个转发者都有自己的粉丝,不同转发者的粉丝可能重复,所以统计之前需要先去重。
实现步骤如下:
第一,获取当前转发帖子的人
第二,获取当前人的粉丝(关注者)
第三,进行粉丝去重
第四,统计人数
第五,最后使用drpc远程调用topology返回执行结果
ReachTopology
package com.xnmzdx.storm.drpc2.server;
import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.LocalDRPC;
import backtype.storm.StormSubmitter;
import backtype.storm.coordination.BatchOutputCollector;
import backtype.storm.drpc.LinearDRPCTopologyBuilder;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.BasicOutputCollector;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseBasicBolt;
import backtype.storm.topology.base.BaseBatchBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;
import java.util.*;
/**
* 注意:传入的参数作为storm DRPC topology的数据源(spout)
* 理解代码思路可以从topology开始
* @author zyt
*
*/
public class ReachTopology {
//手动创造数据(HashMap类型):帖子的url-->转发此url的用户
public static Map<String, List<String>> TWEETERS_DB = new HashMap<String, List<String>>() {
{
put("foo.com/blog/1", Arrays.asList("sally", "bob", "tim", "george", "nathan"));
put("engineering.twitter.com/blog/5", Arrays.asList("adam", "david", "sally", "nathan"));
put("tech.backtype.com/blog/123", Arrays.asList("tim", "mike", "john"));
}
};
//手动创造数据(HashMap类型):用户-->此用户的粉丝
public static Map<String, List<String>> FOLLOWERS_DB = new HashMap<String, List<String>>() {
{
put("sally", Arrays.asList("bob", "tim", "alice", "adam", "jim", "chris", "jai"));
put("bob", Arrays.asList("sally", "nathan", "jim", "mary", "david", "vivian"));
put("tim", Arrays.asList("alex"));
put("nathan", Arrays.asList("sally", "bob", "adam", "harry", "chris", "vivian", "emily", "jordan"));
put("adam", Arrays.asList("david", "carissa"));
put("mike", Arrays.asList("john", "bob"));
put("john", Arrays.asList("alice", "nathan", "jim", "mike", "bob"));
}
};
/**
 * First bolt of the chain: expands a URL into the users who reposted it.
 *
 * Input tuple: field 0 is forwarded unchanged as the "id" output field (per
 * Storm's DRPC documentation this is the request id that the DRPC spout
 * attaches to each call), field 1 is read as the URL string.
 * Output: one ("id", "tweeter") tuple per user that TWEETERS_DB lists for the
 * URL; nothing is emitted when the URL is not in the map.
 */
public static class GetTweeters extends BaseBasicBolt {
    public void execute(Tuple tuple, BasicOutputCollector collector) {
        Object requestId = tuple.getValue(0);
        String postUrl = tuple.getString(1);
        List<String> reposters = TWEETERS_DB.get(postUrl);
        if (reposters == null) {
            // Unknown URL: nobody reposted it, so there is nothing to emit.
            return;
        }
        for (String reposter : reposters) {
            collector.emit(new Values(requestId, reposter));
        }
    }

    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        Fields outputFields = new Fields("id", "tweeter");
        declarer.declare(outputFields);
    }
}
/**
 * Second bolt of the chain: expands a reposting user into that user's followers.
 *
 * Input tuple: ("id", "tweeter"). Field 0 is forwarded unchanged, field 1 is
 * read as the user name.
 * Output: one ("id", "follower") tuple per follower that FOLLOWERS_DB lists
 * for the user; nothing is emitted when the user has no entry in the map.
 * Duplicates across different users are not removed here.
 */
public static class GetFollowers extends BaseBasicBolt {
    public void execute(Tuple tuple, BasicOutputCollector collector) {
        Object requestId = tuple.getValue(0);
        String reposter = tuple.getString(1);
        List<String> fans = FOLLOWERS_DB.get(reposter);
        if (fans == null) {
            // No follower list recorded for this user.
            return;
        }
        for (String fan : fans) {
            collector.emit(new Values(requestId, fan));
        }
    }

    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        Fields outputFields = new Fields("id", "follower");
        declarer.declare(outputFields);
    }
}
/**
 * Batch bolt that de-duplicates followers and reports how many distinct ones it saw.
 *
 * Unlike the two bolts above, this one extends BaseBatchBolt: execute() only
 * accumulates state, and the result is emitted once from finishBatch().
 * Input tuple: ("id", "follower"). Output: a single ("id", "partial-count")
 * tuple carrying the batch id given to prepare() and the size of the set.
 * The count is "partial" because construct() runs several tasks of this bolt,
 * each seeing only part of the followers.
 */
public static class PartialUniquer extends BaseBatchBolt {
    BatchOutputCollector _collector;
    Object _id;
    // A HashSet ignores repeated additions, which is what performs the de-duplication.
    Set<String> _followers = new HashSet<String>();

    /** Stores the collector and the batch id for later use in finishBatch(). */
    public void prepare(Map stormConf, TopologyContext topologyContext, BatchOutputCollector batchCollector, Object batchId) {
        _id = batchId;
        _collector = batchCollector;
    }

    /** Records the follower name found in field 1 of the tuple. */
    public void execute(Tuple tuple) {
        String followerName = tuple.getString(1);
        _followers.add(followerName);
    }

    /** Emits the number of distinct followers collected by this instance. */
    public void finishBatch() {
        int distinctFollowers = _followers.size();
        _collector.emit(new Values(_id, distinctFollowers));
    }

    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        Fields outputFields = new Fields("id", "partial-count");
        declarer.declare(outputFields);
    }
}
/**
 * Final batch bolt: adds up the partial counts into the total reach.
 *
 * Input tuple: ("id", "partial-count"), where field 1 is read as an Integer.
 * Output: a single ("id", "reach") tuple emitted from finishBatch(), carrying
 * the batch id given to prepare() and the accumulated sum (0 if no tuple arrived).
 */
public static class CountAggregator extends BaseBatchBolt {
    BatchOutputCollector _collector;
    Object _id;
    int _count = 0;

    /** Stores the collector and the batch id for later use in finishBatch(). */
    public void prepare(Map stormConf, TopologyContext topologyContext, BatchOutputCollector batchCollector, Object batchId) {
        _id = batchId;
        _collector = batchCollector;
    }

    /** Adds the partial count found in field 1 of the tuple to the running total. */
    public void execute(Tuple tuple) {
        int partialCount = tuple.getInteger(1);
        _count = _count + partialCount;
    }

    /** Emits the accumulated total for this batch. */
    public void finishBatch() {
        Values result = new Values(_id, _count);
        _collector.emit(result);
    }

    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        Fields outputFields = new Fields("id", "reach");
        declarer.declare(outputFields);
    }
}
/**
 * Assembles the linear DRPC topology that implements the "reach" function.
 *
 * Pipeline: GetTweeters -> GetFollowers -> PartialUniquer -> CountAggregator.
 *
 * @return the builder; the caller turns it into a local or remote topology
 */
public static LinearDRPCTopologyBuilder construct() {
    // "reach" is the DRPC function name that clients pass to execute().
    LinearDRPCTopologyBuilder reachBuilder = new LinearDRPCTopologyBuilder("reach");

    // Stage 1: URL -> users who reposted it.
    reachBuilder.addBolt(new GetTweeters(), 4);

    // Stage 2: user -> followers. Any task can handle any user, so tuples are
    // spread with shuffle grouping.
    reachBuilder
            .addBolt(new GetFollowers(), 12)
            .shuffleGrouping();

    // Stage 3: de-duplicate followers. Grouping on ("id", "follower") routes
    // every occurrence of the same follower within one request to the same
    // task, so each follower is counted by exactly one task.
    reachBuilder
            .addBolt(new PartialUniquer(), 6)
            .fieldsGrouping(new Fields("id", "follower"));

    // Stage 4: sum the partial counts. Grouping on "id" brings all partial
    // counts of one request to the same task.
    reachBuilder
            .addBolt(new CountAggregator(), 3)
            .fieldsGrouping(new Fields("id"));

    return reachBuilder;
}
/**
 * Entry point.
 *
 * Without arguments the topology runs in local mode: an in-process cluster and
 * DRPC server are started, a few sample URLs are queried, and both are shut
 * down again. With arguments the topology is submitted to a real cluster under
 * the name given in args[0]; results are then fetched with a separate DRPC client.
 *
 * @param args empty for local mode, otherwise args[0] is the topology name
 * @throws Exception if submitting the topology or executing a DRPC call fails
 */
public static void main(String[] args) throws Exception {
    LinearDRPCTopologyBuilder builder = construct();
    Config conf = new Config();
    if (args == null || args.length == 0) {
        conf.setMaxTaskParallelism(3);
        LocalDRPC drpc = new LocalDRPC();
        try {
            LocalCluster cluster = new LocalCluster();
            try {
                cluster.submitTopology("reach-drpc", conf, builder.createLocalTopology(drpc));
                // Sample requests; the last URL is deliberately absent from TWEETERS_DB.
                String[] urlsToTry = new String[] { "foo.com/blog/1", "engineering.twitter.com/blog/5", "notaurl.com" };
                for (String url : urlsToTry) {
                    System.out.println("Reach of " + url + ": " + drpc.execute("reach", url));
                }
            } finally {
                // Shut down even when a submit or execute call throws, so the
                // in-process cluster does not keep the JVM alive.
                cluster.shutdown();
            }
        } finally {
            // Same order as before: cluster first, then the DRPC server.
            drpc.shutdown();
        }
    } else {
        conf.setNumWorkers(6);
        StormSubmitter.submitTopology(args[0], conf, builder.createRemoteTopology());
    }
}
}
ReachDrpcClient
package com.xnmzdx.storm.drpc2.client;
import backtype.storm.utils.DRPCClient;
/**
 * Command-line client that asks a running DRPC server for the reach of a URL.
 *
 * Usage: ReachDrpcClient &lt;url&gt; [drpc-host] [drpc-port]
 * For example: ReachDrpcClient foo.com/blog/1
 * Host and port are optional and default to the values that were previously
 * hard-coded.
 */
public class ReachDrpcClient {
    private static final String DEFAULT_HOST = "192.168.100.50";
    private static final int DEFAULT_PORT = 3772;

    /**
     * Executes the "reach" DRPC function for args[0] and prints the result.
     *
     * @param args args[0] is the URL to query (required); args[1] is the DRPC
     *             host and args[2] the DRPC port (both optional)
     * @throws Exception if the connection or the remote call fails
     */
    public static void main(String[] args) throws Exception {
        if (args == null || args.length == 0) {
            // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
            System.err.println("Usage: ReachDrpcClient <url> [drpc-host] [drpc-port]");
            System.exit(1);
            return;
        }
        String url = args[0];
        String host = args.length > 1 ? args[1] : DEFAULT_HOST;
        int port = args.length > 2 ? Integer.parseInt(args[2]) : DEFAULT_PORT;

        DRPCClient client = new DRPCClient(host, port);
        try {
            System.out.println(client.execute("reach", url));
        } finally {
            // Release the Thrift connection even if the call fails.
            client.close();
        }
    }
}
如果在storm集群上运行此程序,则需要将项目打成jar包上传至storm集群上并使用命令运行,然后再运行ReachDrpcClient类
storm jar jar包名称 运行的类的全名称 参数1 参数2 ...(多个参数之间用空格分隔)