storm两类统计逻辑
汇总型:如网站pv,销售额,订单数等
去重:网站UV,顾客数,销售商品数等
汇总型方案:
1,shuffleGrouping下,pv(单线程结果)*executor并发数
一个executor默认一个task,如果设置task数大于1,公式应该是:
pv(单线程结果)*task数
同一个executor下task的线程id相同,但是taskid不同
优点:简单,计算量小
缺点:稍有误差,但绝大多数场景能接受
优化:
案例PVBolt中每个task都会输出一个汇总量,实际只需要一个task输出总值,利用zookeeper锁来做到只有一个task输出汇总数,而且每5S输出一次
2,bolt1进行多并发局部汇总,bolt2单线程进行全局汇总
优点:1,绝对准确;2,如果用fieldsGrouping可以得到中间值,如单个user的访问PV(访问深度,也是有用指标)
缺点:计算量稍大,且多一个bolt
生产数据:
package base;

import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.IRichSpout;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Values;

import java.util.Map;
import java.util.Queue;
import java.util.Random;
import java.util.concurrent.ConcurrentLinkedQueue;

/**
 * Created by Administrator on 2016/10/7.
 *
 * <p>Data-source spout that fabricates access-log lines for the PV examples.
 * On {@link #open} it queues a fixed number of lines of the form
 * {@code host \t session_id \t time}; {@link #nextTuple} then emits them one
 * at a time on the single output field {@code "log"}.
 */
public class SourceSpout implements IRichSpout {

    private static final long serialVersionUID = 1L;

    /** Number of synthetic log lines generated when the spout is opened. */
    private static final int LOG_LINE_COUNT = 100;

    /** Pause after every {@code nextTuple} call, in milliseconds. */
    private static final long EMIT_INTERVAL_MS = 200;

    /** Pending log lines waiting to be emitted. */
    Queue<String> queue = new ConcurrentLinkedQueue<String>();

    /** Collector handed over by Storm in {@link #open}. */
    SpoutOutputCollector collector = null;

    /** Unused; kept so that the set of fields of this class is unchanged. */
    String str = null;

    /**
     * Stores the collector and fills the queue with randomly combined
     * host / session id / timestamp lines.
     *
     * @param map                  topology configuration (not used)
     * @param topologyContext      task context (not used)
     * @param spoutOutputCollector collector used later by {@link #nextTuple}
     */
    @Override
    public void open(Map map, TopologyContext topologyContext,
                     SpoutOutputCollector spoutOutputCollector) {
        this.collector = spoutOutputCollector;

        Random random = new Random();
        String[] hosts = {"www.taobao.com"};
        String[] session_id = {
            "ABYH6Y4V4SCVXTG6DPB4VH9U123",
            "XXYH6YCGFJYERTT834R52FDXV9U34",
            "BBYH61456FGHHJ7JL89RG5VV9UYU7",
            "CYYH6Y2345GHI899OFG4V9U567",
            "VVVYH6Y4V4SFXZ56JIPDPB4V678"
        };
        String[] time = {
            "2014-01-07 08:40:50",
            "2014-01-07 08:40:51",
            "2014-01-07 08:40:52",
            "2014-01-07 08:40:53",
            "2014-01-07 09:40:49",
            "2014-01-07 10:40:49",
            "2014-01-07 11:40:49",
            "2014-01-07 12:40:49"
        };

        for (int i = 0; i < LOG_LINE_COUNT; i++) {
            // Bound the random index by the array length so that editing the
            // sample data above cannot cause an out-of-range access.
            queue.add(hosts[0]
                    + "\t" + session_id[random.nextInt(session_id.length)]
                    + "\t" + time[random.nextInt(time.length)]);
        }
    }

    @Override
    public void close() {
    }

    @Override
    public void activate() {
    }

    @Override
    public void deactivate() {
    }

    /**
     * Emits the next queued log line, if any, then pauses for
     * {@link #EMIT_INTERVAL_MS} milliseconds.
     *
     * <p>Once the queue is drained nothing is emitted. The previous guard
     * ({@code queue.size() >= 0}) was always true, so a tuple holding
     * {@code null} was emitted on every call after the queue ran dry.
     */
    @Override
    public void nextTuple() {
        String line = queue.poll();
        if (line != null) {
            collector.emit(new Values(line));
        }
        try {
            Thread.sleep(EMIT_INTERVAL_MS);
        } catch (InterruptedException e) {
            // Restore the flag so the calling thread can see the interrupt.
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Logs the acknowledged message id.
     *
     * @param o message id of the acknowledged tuple
     */
    @Override
    public void ack(Object o) {
        // String concatenation prints "null" instead of throwing on a null id.
        System.out.println("spout ack:" + o);
    }

    /**
     * Logs the failed message id.
     *
     * @param o message id of the failed tuple
     */
    @Override
    public void fail(Object o) {
        System.out.println("spout fail:" + o);
    }

    /** Declares the single output field {@code "log"}. */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        outputFieldsDeclarer.declare(new Fields("log"));
    }

    /** @return {@code null}: this spout supplies no component-specific configuration */
    @Override
    public Map<String, Object> getComponentConfiguration() {
        return null;
    }
}
处理数据:
package visits;

import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.IRichBolt;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.tuple.Tuple;
import backtype.storm.utils.ZookeeperAuthInfo;
import org.apache.http.conn.util.InetAddressUtils;
import org.apache.zookeeper.*;

import java.net.InetAddress;
import java.nio.charset.Charset;
import java.util.Map;

/**
 * Created by Administrator on 2016/10/6.
 *
 * <p>Counts page views (PV) per task and estimates the global PV as
 * {@code local count * number of tasks of this component}. Every task keeps
 * its own counter, but only the task whose identity is stored in the
 * ZooKeeper node {@link #zk_path} prints the estimate, at most once every
 * {@link #REPORT_INTERVAL_MS} milliseconds.
 *
 * <p>Operational notes:
 * <pre>
 * kill the job:    storm kill PvTopo
 * submit the job:  storm jar ./starter.jar visits.PvTopo PvTopo
 *
 * The parent path must exist in ZooKeeper before the topology starts:
 *   zkCli.sh -server localhost:2181
 *   ls /
 *   create /lock ""
 *   create /lock/storm ""
 *   ls /lock
 * </pre>
 *
 * <p>With {@code IRichBolt} the bolt itself has to call {@code ack} (or
 * {@code fail}) on the collector for every tuple it receives.
 */
public class PVBolt implements IRichBolt {

    private static final long serialVersionUID = 1L;

    /** ZooKeeper node whose data names the one task allowed to report. */
    public static final String zk_path = "/lock/storm/pv";

    private static final String ZK_SERVERS = "192.168.1.107:2181,192.168.1.108:2181";
    private static final int ZK_SESSION_TIMEOUT_MS = 3000;
    private static final long REPORT_INTERVAL_MS = 5 * 1000;

    /** Explicit charset so the bytes written and read back always agree. */
    private static final Charset UTF_8 = Charset.forName("UTF-8");

    ZooKeeper zKeeper = null;

    /** Identity of this task: {@code "<host address>:<task id>"}. */
    String lockData = null;

    String logString = null;
    String sessionid = null;

    /**
     * Page views counted by this task only. A {@code synchronized} block or a
     * static counter would not make this global: both work only inside one JVM.
     */
    long Pv = 0;

    long beginTime = System.currentTimeMillis();
    long endTimes = 0;

    /** Collector used to ack incoming tuples; assigned in {@link #prepare}. */
    private transient OutputCollector collector;

    /** Multiplier for the local count; replaced by the real task count in {@link #prepare}. */
    private int taskCount = 1;

    /**
     * Connects to ZooKeeper and tries to become the reporting task by creating
     * the ephemeral node {@link #zk_path} with this task's identity as data.
     *
     * @param map             topology configuration (not used)
     * @param topologyContext supplies this task's id and the component's task list
     * @param outputCollector collector used to ack tuples in {@link #execute}
     */
    @Override
    public void prepare(Map map, TopologyContext topologyContext, OutputCollector outputCollector) {
        this.collector = outputCollector;
        // Under shuffleGrouping each task sees roughly 1/taskCount of the stream.
        this.taskCount = topologyContext
                .getComponentTasks(topologyContext.getThisComponentId()).size();
        // Start the reporting window when the task starts, not when the bolt
        // object was constructed.
        this.beginTime = System.currentTimeMillis();

        try {
            zKeeper = new ZooKeeper(ZK_SERVERS, ZK_SESSION_TIMEOUT_MS, new Watcher() {
                @Override
                public void process(WatchedEvent watchedEvent) {
                    System.out.println("event:" + watchedEvent.getType());
                }
            });
            while (zKeeper.getState() != ZooKeeper.States.CONNECTED) {
                Thread.sleep(1000);
            }

            // Host address plus task id is used as this task's identity.
            InetAddress address = InetAddress.getLocalHost();
            lockData = address.getHostAddress() + ":" + topologyContext.getThisTaskId();

            try {
                // 'false' means: do not register a watch on the node.
                if (zKeeper.exists(zk_path, false) == null) {
                    zKeeper.create(zk_path, lockData.getBytes(UTF_8),
                            ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
                }
            } catch (KeeperException.NodeExistsException raceLost) {
                // Another task created the node between exists() and create().
                // That task is the reporter; this one keeps its connection so
                // it can still read the node in execute().
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            closeZooKeeper();
        } catch (Exception e) {
            e.printStackTrace();
            closeZooKeeper();
        }
    }

    /**
     * Counts the tuple if it carries a session id, prints the estimated global
     * PV when the reporting interval has elapsed and this task owns the lock
     * node, and finally acks the tuple.
     *
     * @param tuple tuple whose first field is a tab-separated log line
     *              ({@code host \t session_id \t time})
     */
    @Override
    public void execute(Tuple tuple) {
        try {
            endTimes = System.currentTimeMillis();
            logString = tuple.getString(0);
            if (logString != null) {
                String[] columns = logString.split("\t");
                // A line without a second column has no session id and is not counted.
                sessionid = columns.length > 1 ? columns[1] : null;
                if (sessionid != null) {
                    Pv++;
                }
            }

            if (endTimes - beginTime >= REPORT_INTERVAL_MS) {
                System.err.println(lockData + "=========================");
                if (isReportingTask()) {
                    System.out.println("pv ================== " + Pv * taskCount);
                }
                beginTime = System.currentTimeMillis();
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Ack in every case: a reporting problem is not a tuple failure.
            if (collector != null) {
                collector.ack(tuple);
            }
        }
    }

    /** Closes the ZooKeeper connection opened in {@link #prepare}. */
    @Override
    public void cleanup() {
        closeZooKeeper();
    }

    /** This bolt declares no output fields. */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
    }

    /** @return {@code null}: this bolt supplies no component-specific configuration */
    @Override
    public Map<String, Object> getComponentConfiguration() {
        return null;
    }

    /**
     * Tells whether the data stored at {@link #zk_path} is this task's identity.
     * The node data is a {@code byte[]}, so it is decoded before comparing;
     * comparing the String directly against the array is never equal.
     */
    private boolean isReportingTask() throws KeeperException, InterruptedException {
        if (zKeeper == null || lockData == null) {
            return false;
        }
        byte[] owner = zKeeper.getData(zk_path, false, null);
        return owner != null && lockData.equals(new String(owner, UTF_8));
    }

    /** Closes the ZooKeeper handle if one exists and clears the field. */
    private void closeZooKeeper() {
        if (zKeeper == null) {
            return;
        }
        try {
            zKeeper.close();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        } finally {
            zKeeper = null;
        }
    }
}
执行main:
package visits;

import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.StormSubmitter;
import backtype.storm.generated.AlreadyAliveException;
import backtype.storm.generated.InvalidTopologyException;
import backtype.storm.topology.TopologyBuilder;
import base.SourceSpout;
import myfirst.MySpout;

import java.util.HashMap;
import java.util.Map;

/**
 * Entry point that wires the PV topology and submits it either to a real
 * cluster or to an in-process {@link LocalCluster}.
 */
public class PvTopo {

    /**
     * Builds the topology and submits it.
     *
     * <p>Note from the original author: a bolt running with several parallel
     * tasks cannot produce a global total by itself.
     *
     * @param args when non-empty, {@code args[0]} is the topology name used for
     *             cluster submission; when empty, the topology runs in a local
     *             cluster under the name {@code "mytopology"}
     */
    public static void main(String[] args) {
        TopologyBuilder wiring = wireTopology();

        Map settings = new HashMap();
        settings.put(Config.TOPOLOGY_WORKERS, 4);

        // No arguments: run inside the current JVM.
        if (args.length == 0) {
            LocalCluster inProcessCluster = new LocalCluster();
            inProcessCluster.submitTopology("mytopology", settings, wiring.createTopology());
            return;
        }

        try {
            StormSubmitter.submitTopology(args[0], settings, wiring.createTopology());
        } catch (AlreadyAliveException alreadyRunning) {
            alreadyRunning.printStackTrace();
        } catch (InvalidTopologyException invalid) {
            invalid.printStackTrace();
        }
    }

    /**
     * Declares one spout task, four {@code PVBolt} tasks fed by shuffle
     * grouping, and one {@code PVSumBolt} task fed from the PV bolt.
     */
    private static TopologyBuilder wireTopology() {
        TopologyBuilder wiring = new TopologyBuilder();
        wiring.setSpout("spout", new SourceSpout(), 1);
        wiring.setBolt("bolt", new PVBolt(), 4).shuffleGrouping("spout");
        // With a single consuming task every grouping behaves the same.
        wiring.setBolt("sumBolt", new PVSumBolt(), 1).shuffleGrouping("bolt");
        return wiring;
    }
}