Storm项目04(2)——storm DRPC的一个实例应用场景:统计帖子(url)转发给粉丝的人数

实例场景:

我们在微博、论坛进行转发帖子的时候,是对url进行转发,这个例子就是统计一下帖子(url)转发给粉丝的人数。

这个统计主要借助storm的并行计算能力来完成。

分析:

粉丝即关注我的人。一个帖子可能被多个人转发,而不同转发者的粉丝之间可能有重复,所以统计人数之前必须先去重。

实现步骤如下:

第一,获取当前转发帖子的人

第二,获取当前人的粉丝(关注者)

第三,进行粉丝去重

第四,统计人数

第五,最后使用drpc远程调用topology返回执行结果

ReachTopology

package com.xnmzdx.storm.drpc2.server;

import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.LocalDRPC;
import backtype.storm.StormSubmitter;
import backtype.storm.coordination.BatchOutputCollector;
import backtype.storm.drpc.LinearDRPCTopologyBuilder;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.BasicOutputCollector;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseBasicBolt;
import backtype.storm.topology.base.BaseBatchBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;

import java.util.*;

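/**
 * Reach example: a DRPC topology that counts how many distinct followers a shared url reaches.
 * Note: the argument passed to the DRPC call is the data source of the topology (it plays the role of a spout).
 * To follow the code, start reading from the topology definition in construct().
 * @author zyt
 *
 */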
public class ReachTopology {
	
	/**
	 * Hand-made sample data: post url -> users who shared that url.
	 * The table is filled once during class initialization and exposed read-only,
	 * so a lookup can never accidentally alter the sample data.
	 */
	public static final Map<String, List<String>> TWEETERS_DB = buildTweetersDb();

	/**
	 * Populates the url -> sharers table with a plain HashMap (no anonymous
	 * "double-brace" HashMap subclass) and wraps it in an unmodifiable view.
	 */
	private static Map<String, List<String>> buildTweetersDb() {
		Map<String, List<String>> db = new HashMap<String, List<String>>();
		db.put("foo.com/blog/1", Arrays.asList("sally", "bob", "tim", "george", "nathan"));
		db.put("engineering.twitter.com/blog/5", Arrays.asList("adam", "david", "sally", "nathan"));
		db.put("tech.backtype.com/blog/123", Arrays.asList("tim", "mike", "john"));
		return Collections.unmodifiableMap(db);
	}

	/**
	 * Hand-made sample data: user -> followers of that user.
	 * The table is filled once during class initialization and exposed read-only,
	 * so a lookup can never accidentally alter the sample data.
	 */
	public static final Map<String, List<String>> FOLLOWERS_DB = buildFollowersDb();

	/**
	 * Populates the user -> followers table with a plain HashMap (no anonymous
	 * "double-brace" HashMap subclass) and wraps it in an unmodifiable view.
	 */
	private static Map<String, List<String>> buildFollowersDb() {
		Map<String, List<String>> db = new HashMap<String, List<String>>();
		db.put("sally", Arrays.asList("bob", "tim", "alice", "adam", "jim", "chris", "jai"));
		db.put("bob", Arrays.asList("sally", "nathan", "jim", "mary", "david", "vivian"));
		db.put("tim", Arrays.asList("alex"));
		db.put("nathan", Arrays.asList("sally", "bob", "adam", "harry", "chris", "vivian", "emily", "jordan"));
		db.put("adam", Arrays.asList("david", "carissa"));
		db.put("mike", Arrays.asList("john", "bob"));
		db.put("john", Arrays.asList("alice", "nathan", "jim", "mike", "bob"));
		return Collections.unmodifiableMap(db);
	}


	//编写bolt
	public static class GetTweeters extends BaseBasicBolt {

		public void execute(Tuple tuple, BasicOutputCollector collector) {
			//此bolt的数据输入流就是传入的参数(格式:("reach", "foo.com/blog/1"))
			Object id = tuple.getValue(0);  //???getValue应该是获取数值的意思,下标为0的数值可能是随机分配的id值
			String url = tuple.getString(1);//获取下标为1且为字符串类型 正好是url
			List<String> tweeters = TWEETERS_DB.get(url);   //获取数据中转发该url的用户
			if (tweeters != null) {
				for (String tweeter : tweeters) {
					collector.emit(new Values(id, tweeter));//将(id,用户)向下发送
				}
			}
		}

		public void declareOutputFields(OutputFieldsDeclarer declarer) {
			declarer.declare(new Fields("id", "tweeter"));
		}
	}

	//编写bolt
	public static class GetFollowers extends BaseBasicBolt {

		public void execute(Tuple tuple, BasicOutputCollector collector) {
			//此bolt的数据输入流的格式为:("id", "tweeter")
			Object id = tuple.getValue(0);
			String tweeter = tuple.getString(1);
			List<String> followers = FOLLOWERS_DB.get(tweeter);   //获取该用户的粉丝
			if (followers != null) {
				for (String follower : followers) {
					collector.emit(new Values(id, follower));   //将(id,粉丝)向下发送
				}
			}
		}

		public void declareOutputFields(OutputFieldsDeclarer declarer) {
			declarer.declare(new Fields("id", "follower"));
		}
	}

	//编写bolt,注意:此bolt继承的是BaseBatchBolt(之前没有用到过)
	public static class PartialUniquer extends BaseBatchBolt {
		BatchOutputCollector _collector;
		Object _id;
		Set<String> _followers = new HashSet<String>();  //用于粉丝去重,HashSet中的元素没有下标,无序,不会重复

		//初始化定义的属性
		public void prepare(Map conf, TopologyContext context, BatchOutputCollector collector, Object id) {
			_collector = collector;
			_id = id;
		}

		public void execute(Tuple tuple) {
			//此bolt的数据输入流的格式为:("id", "follower")
			_followers.add(tuple.getString(1));  //第二个字段是粉丝(follower),HashSet类自动去重
		}

		public void finishBatch() {
			_collector.emit(new Values(_id, _followers.size()));
		}

		public void declareOutputFields(OutputFieldsDeclarer declarer) {
			declarer.declare(new Fields("id", "partial-count"));
		}
	}

	//编写bolt
	public static class CountAggregator extends BaseBatchBolt {
		BatchOutputCollector _collector;
		Object _id;
		int _count = 0;

		public void prepare(Map conf, TopologyContext context, BatchOutputCollector collector, Object id) {
			_collector = collector;
			_id = id;
		}

		public void execute(Tuple tuple) {
			_count += tuple.getInteger(1);  //将粉丝数量累加
		}

		public void finishBatch() {
			_collector.emit(new Values(_id, _count));
		}

		public void declareOutputFields(OutputFieldsDeclarer declarer) {
			declarer.declare(new Fields("id", "reach"));
		}
	}

	/**
	 * Wires the four bolts into a linear DRPC topology registered under the function name "reach".
	 *
	 * @return the configured builder; the caller decides whether to create a local or a remote topology from it
	 */
	public static LinearDRPCTopologyBuilder construct() {
		final LinearDRPCTopologyBuilder reach = new LinearDRPCTopologyBuilder("reach");

		// Step 1: url -> users who shared it.
		reach.addBolt(new GetTweeters(), 4);

		// Step 2: user -> followers. Any task can do the lookup, so tuples are spread randomly.
		reach.addBolt(new GetFollowers(), 12).shuffleGrouping();

		// Step 3: de-duplicate. Grouping on (id, follower) routes every occurrence of the same
		// follower within the same request to the same task, so each task's set is exact.
		final Fields byRequestAndFollower = new Fields("id", "follower");
		reach.addBolt(new PartialUniquer(), 6).fieldsGrouping(byRequestAndFollower);

		// Step 4: sum up. Grouping on id routes all partial counts of one request to one task.
		final Fields byRequest = new Fields("id");
		reach.addBolt(new CountAggregator(), 3).fieldsGrouping(byRequest);

		return reach;
	}

	/**
	 * Entry point. Without arguments the topology runs in local mode: it is submitted to an
	 * in-process cluster, queried for a few sample urls through an in-process DRPC server, and
	 * then shut down. With arguments it is submitted to a real cluster under the name args[0];
	 * in that case a separate DRPC client is needed to query it.
	 *
	 * @param args empty for local mode; otherwise args[0] is the topology name
	 * @throws Exception if submitting the topology or executing a DRPC request fails
	 */
	public static void main(String[] args) throws Exception {
		LinearDRPCTopologyBuilder builder = construct();

		Config conf = new Config();

		if (args == null || args.length == 0) {
			// Local mode.
			conf.setMaxTaskParallelism(3);
			LocalDRPC drpc = new LocalDRPC();
			try {
				LocalCluster cluster = new LocalCluster();
				try {
					cluster.submitTopology("reach-drpc", conf, builder.createLocalTopology(drpc));

					// Sample requests; the last url is deliberately absent from the sample data.
					String[] urlsToTry = new String[] { "foo.com/blog/1", "engineering.twitter.com/blog/5", "notaurl.com" };
					for (String url : urlsToTry) {
						System.out.println("Reach of " + url + ": " + drpc.execute("reach", url));
					}
				} finally {
					// Always stop the in-process cluster, even when a request failed.
					cluster.shutdown();
				}
			} finally {
				// Stop the in-process DRPC server last, and even if the cluster shutdown threw.
				drpc.shutdown();
			}
		} else {
			// Cluster mode: submit only; queries come from a DRPC client.
			conf.setNumWorkers(6);
			StormSubmitter.submitTopology(args[0], conf, builder.createRemoteTopology());
		}
	}
}

ReachDrpcClient

package com.xnmzdx.storm.drpc2.client;

import backtype.storm.utils.DRPCClient;

/**
 * Command-line client that asks a running DRPC server for the "reach" of a url and prints the result.
 * The "reach" topology must already be deployed on the cluster the DRPC server belongs to.
 */
public class ReachDrpcClient {

	/** DRPC server host used when none is given on the command line. */
	private static final String DEFAULT_DRPC_HOST = "192.168.100.50";

	/** DRPC server port used when none is given on the command line. */
	private static final int DEFAULT_DRPC_PORT = 3772;

	/**
	 * Executes one "reach" request.
	 *
	 * @param args args[0] is the url to query, e.g. "foo.com/blog/1";
	 *             optional args[1] overrides the DRPC host and optional args[2] the DRPC port
	 * @throws Exception if the port is not a number or the DRPC request fails
	 */
	public static void main(String[] args) throws Exception {
		if (args == null || args.length == 0) {
			// Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
			System.err.println("Usage: ReachDrpcClient <url> [drpc-host] [drpc-port]");
			System.exit(1);
			return;
		}

		String url = args[0];
		String host = args.length > 1 ? args[1] : DEFAULT_DRPC_HOST;
		int port = args.length > 2 ? Integer.parseInt(args[2]) : DEFAULT_DRPC_PORT;

		DRPCClient client = new DRPCClient(host, port);
		try {
			System.out.println(client.execute("reach", url));
		} finally {
			// Release the connection to the DRPC server whether or not the request succeeded.
			client.close();
		}
	}

}

如果在storm集群上运行此程序,则需要将项目打成jar包上传至storm集群上并使用命令运行,然后再运行ReachDrpcClient类

storm jar jar包名称 运行的类的全名称 参数1 参数2 ...(参数之间用空格分隔;本例中参数1是topology的名称)


 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值