【Storm】Trident实战之计算网站PV

需求

用Trident实现汇总型PV统计;

1:按天

2:累计

FixedBatchSpout发送若干个tuple,每个tuple是一个字符串,字符串格式是 hosts + "\t" + sessionId + "\t" + times,其中times是日期 + 时间。每个batch最多包含3个tuple,不设置为持续发送。DRPC 函数名称 getPV,execute第二个参数是日期. 可以查询多天,但日期之间要用空格隔开;

(1)MySplit类,继承 BaseFunction;构造时传入的pattern作为分隔符;输入的字符串按照pattern分割后,发射2个字段:date 和 sessionId。其中date只含有日期,不含时间。

package trident.function;

import Util.DateFmt;
import backtype.storm.tuple.Values;
import storm.trident.operation.BaseFunction;
import storm.trident.operation.TridentCollector;
import storm.trident.tuple.TridentTuple;

/**
 * Trident function that turns one raw log line into a (date, sessionId) pair.
 *
 * <p>The first field of the incoming tuple is split with the regular expression given to the
 * constructor (the topology in this article passes {@code "\t"}). Only lines that split into
 * exactly three parts produce output; any other line is dropped without emitting.
 */
public class MySplit extends BaseFunction{
	
	private static final long serialVersionUID = 1L;
	
	// Regular expression used as the field separator.
	String pattern = null;
	
	/**
	 * @param pattern regular expression used to split each log line
	 */
	public MySplit(String pattern) {
		this.pattern = pattern;
	}
	
	/**
	 * Splits the log line and emits two values: the formatted date and the session id.
	 *
	 * @param tuple     input tuple whose first field is the log line
	 * @param collector collector that receives the emitted (date, sessionId) values
	 */
	@Override
	public void execute(TridentTuple tuple, TridentCollector collector) {
		String[] parts = tuple.getString(0).split(pattern);
		
		// Anything that is not exactly three parts is skipped.
		if (parts.length != 3) {
			return;
		}
		
		// The third part is passed through DateFmt with DATE_SHORT; presumably this keeps
		// only the date portion -- confirm against DateFmt.
		String date = DateFmt.getCountDate(parts[2], DateFmt.DATE_SHORT);
		String sessionId = parts[1];
		collector.emit(new Values(date, sessionId));
	}
	
}

(2)Split类,继承 BaseFunction;传入的pattern作为分割符号;传入的字符串,按照pattern分割,分割后的字符串逐个发送;

package trident.function;

import backtype.storm.tuple.Values;
import storm.trident.operation.BaseFunction;
import storm.trident.operation.TridentCollector;
import storm.trident.tuple.TridentTuple;

/**
 * Trident function that splits a string and emits every resulting piece as its own tuple.
 *
 * <p>The first field of the incoming tuple is split with the regular expression given to the
 * constructor; one single-value tuple is emitted per piece, in order.
 */
public class Split extends BaseFunction {
	
	private static final long serialVersionUID = 1L;
	
	// Regular expression used as the separator.
	String pattern = null;
	
	/**
	 * @param pattern regular expression used to split the input string
	 */
	public Split(String pattern) {
		this.pattern = pattern;
	}
	
	/**
	 * Emits one tuple for each piece of the split input string.
	 *
	 * @param tuple     input tuple whose first field is the string to split
	 * @param collector collector that receives one value per piece
	 */
	@Override
	public void execute(TridentTuple tuple, TridentCollector collector) {
		String[] pieces = tuple.getString(0).split(pattern);
		
		for (int i = 0; i < pieces.length; i++) {
			collector.emit(new Values(pieces[i]));
		}
	}
}

(3)FixedBatchSpout作为数据源;TridentState存储spout数据处理完后的结果;用drpc发起查询;查询的是 "2019-09-15" 和 "2019-09-16" 这两天每天各有多少PV;

package trident;

import java.util.Random;

import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.LocalDRPC;
import backtype.storm.StormSubmitter;
import backtype.storm.generated.StormTopology;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Values;
import storm.trident.TridentState;
import storm.trident.TridentTopology;
import storm.trident.operation.builtin.Count;
import storm.trident.operation.builtin.FilterNull;
import storm.trident.operation.builtin.MapGet;
import storm.trident.testing.FixedBatchSpout;
import storm.trident.testing.MemoryMapState;
import trident.function.MySplit;
import trident.function.Split;

/**
 * Trident topology that counts page views (PV) per day and answers per-day queries over DRPC.
 *
 * <p>A {@link FixedBatchSpout} feeds a fixed set of randomly composed log lines of the form
 * {@code host + "\t" + sessionId + "\t" + dateTime}. The lines are split by {@link MySplit},
 * grouped by date and counted into a {@link MemoryMapState}. The DRPC function {@code getPV}
 * takes a space-separated list of dates and returns the stored count for each of them.
 */
public class TridentPVTopo {
	
	/** Number of sample log lines handed to the spout. */
	private static final int SAMPLE_LOG_COUNT = 8;
	
	/** Maximum number of tuples the spout puts into one batch. */
	private static final int MAX_BATCH_SIZE = 3;
	
	/**
	 * Builds the PV counting topology together with its DRPC query stream.
	 *
	 * @param drpc local DRPC instance for local mode, or {@code null} when submitting to a cluster
	 * @return the assembled topology
	 */
	public static StormTopology buildTopology(LocalDRPC drpc) {

		Random random = new Random();

		// The site being visited is taobao.
		String hosts = "www.taobao.com";
		// Candidate session ids, one is picked at random per log line.
		String[] sessionId = { "5GFBAT3D3100A7A7255027A70", "5X16BCA8823AC4BD9CD196A5D", "5CFBA5BD76BACF436ACA9DCC8",
				"5D16C3E0209C16DEAA28L1824", "5I16CB309251CCF6CE6223BA1", "5C16BC4MB91B85661FE22F413",
				"5D16C1F5191CF9371Y32B58CF", "5D16C7A886E2P2AE3EA29FC3E", "5C3FBA728FD7D264B80769B23",
				"5B16C0F7215109AG43528BA2D", "5N16C2FE51E5619C2A1244215", "5D16C1EB1C7A751AE03201C3F" };

		// Candidate visit times, one is picked at random per log line.
		String[] times = { "2019-09-15 08:01:36", "2019-09-15 08:11:37", "2019-09-15 08:31:38", "2019-09-15 09:23:07",
				"2019-09-15 10:51:27", "2019-09-15 10:51:56", "2019-09-15 11:01:07", "2019-09-15 11:01:20",
				"2019-09-16 11:45:30", "2019-09-16 12:31:49", "2019-09-16 12:41:51", "2019-09-16 12:51:37",
				"2019-09-16 13:11:27", "2019-09-16 13:20:40", "2019-09-16 13:31:38"};

		// Build the sample log lines. The random bounds come from the array lengths so that
		// adding or removing entries above can never cause an out-of-range index.
		Values[] logs = new Values[SAMPLE_LOG_COUNT];
		for (int i = 0; i < logs.length; i++) {
			String session = sessionId[random.nextInt(sessionId.length)];
			String time = times[random.nextInt(times.length)];
			logs[i] = new Values(hosts + "\t" + session + "\t" + time);
		}

		// Spout: each tuple is one log line, at most MAX_BATCH_SIZE tuples per batch.
		FixedBatchSpout spout = new FixedBatchSpout(new Fields("eachLog"), MAX_BATCH_SIZE, logs);

		// With true the spout would keep re-sending the data; here it is sent only once.
		spout.setCycle(false);

		TridentTopology topology = new TridentTopology();

		// Use the spout as the data source; the aggregated result is kept in a TridentState.
		// No parallelismHint is set, to keep the output easy to observe.
		TridentState pvCounts = topology.newStream("spout1", spout)
				.each(new Fields("eachLog"), new MySplit("\t"), new Fields("date", "sessionId"))
				.groupBy(new Fields("date"))      // group by date
				// Persist into MemoryMapState, an in-memory map; an external store could be used instead.
				.persistentAggregate(new MemoryMapState.Factory(), new Fields("sessionId"), new Count(), new Fields("PV"));

		// DRPC query stream.
		topology.newDRPCStream("getPV", drpc)
				.each(new Fields("args"), new Split(" "), new Fields("date"))
				.groupBy(new Fields("date"))         // grouping by date also de-duplicates repeated dates in args
				.stateQuery(pvCounts, new Fields("date"), new MapGet(), new Fields("PV"))
				.each(new Fields("PV"), new FilterNull());       // drop dates that have no stored value

		return topology.build();
	}

	/**
	 * Runs the topology locally when no arguments are given, otherwise submits it to a cluster.
	 *
	 * @param args empty for local mode; otherwise {@code args[0]} is the topology name
	 * @throws Exception if submission, the DRPC call or the sleep fails
	 */
	public static void main(String[] args) throws Exception {
		Config conf = new Config();
		conf.setMaxSpoutPending(20);

		// Without arguments, submit in local mode.
		if (args.length == 0) {
			LocalDRPC drpc = new LocalDRPC();
			LocalCluster cluster = new LocalCluster();
			try {
				cluster.submitTopology("PVCounter", conf, buildTopology(drpc));
				// Query 100 times, waiting one second between queries.
				for (int i = 0; i < 100; i++) {
					// First argument is the DRPC function name, second is the space-separated list of dates.
					System.err.println("DRPC RESULT: " + drpc.execute("getPV", "2019-09-15 2019-09-16"));
					Thread.sleep(1000);
				}
			} finally {
				// Stop the in-process cluster and DRPC server so their threads do not outlive the demo.
				cluster.shutdown();
				drpc.shutdown();
			}
		} else { // submit in distributed mode
			conf.setNumWorkers(3);
			StormSubmitter.submitTopology(args[0], conf, buildTopology(null));
		}
	}
}

打印出的日志(tuple的内容是随机生成的,每次运行的结果可能不同),本次运行中2019-09-15有2个PV,2019-09-16有6个PV;

DRPC RESULT: [["2019-09-15 2019-09-16","2019-09-15",2],["2019-09-15 2019-09-16","2019-09-16",6]]

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值