A Common Storm Pattern --- Computing Top N

This write-up draws on storm-starter and many examples found online.
Three bolts are needed in total; a sketch of the topology that wires them together appears after bolt 3:

// Bolt 1: computes a rolling count for each object in real time; it is also responsible for wiping out the oldest bucket of data.
package com.cucc.roam.storm.bolt;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;
import backtype.storm.utils.Utils;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.apache.log4j.Logger;

public class RollingCountObjects extends BaseRichBolt {
    public static Logger LOG = Logger.getLogger(RollingCountObjects.class);
    private HashMap<Object, long[]> _objectCounts = new HashMap<Object, long[]>();
    private int _numBuckets;   // e.g. 60 buckets
    private transient Thread cleaner;
    private OutputCollector _collector;
    private int _trackMinutes; // e.g. 10 minutes

    public RollingCountObjects(int numBuckets, int trackMinutes) {
        _numBuckets = numBuckets;
        _trackMinutes = trackMinutes;
    }

    public long totalObjects (Object obj) {
        long[] curr = _objectCounts.get(obj);
        long total = 0;
        for (long l: curr) {
            total+=l;
        }
        return total;
    }

    public int currentBucket (int buckets) {
        return (currentSecond()  / secondsPerBucket(buckets)) % buckets;
    }

    public int currentSecond() { // current system time, in seconds
        return (int) (System.currentTimeMillis() / 1000);
    }

    public int secondsPerBucket(int buckets) {
        return (_trackMinutes * 60 / buckets); // how many seconds each bucket covers
    }

    public long millisPerBucket(int buckets) {
        return (long) secondsPerBucket(buckets) * 1000;
    }

    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        _collector = collector;
        cleaner = new Thread(new Runnable() {
            public void run() {
                Integer lastBucket = currentBucket(_numBuckets);
                StringBuffer sb = new StringBuffer();

                while(true) {
                  int currBucket = currentBucket(_numBuckets);

                  sb.setLength(0);
                  sb.append("\n##########\nbegin,线程while循环: 当前的桶为:" + currBucket);


                  if(currBucket!=lastBucket) {
                      sb.append("\n线程while循环:之前的桶数为:" + lastBucket);

                      int bucketToWipe = (currBucket + 1) % _numBuckets;
                      sb.append("\n线程while循环:要擦除掉的桶为:" + bucketToWipe);

                      synchronized(_objectCounts) {
                          Set objs = new HashSet(_objectCounts.keySet());
                          for (Object obj: objs) {
                            long[] counts = _objectCounts.get(obj);
                            long currBucketVal = counts[bucketToWipe];
                            sb.append("\n线程while循环:擦除掉的值为:" + currBucketVal+",擦除的对象为"+obj);

                            sb.append("\n擦出前数组:");//LOG.info
                            for (long number : counts) {
                                sb.append(number + ":");
                            }

                            counts[bucketToWipe] = 0;
                            sb.append("\n擦出后数组:");
                            for (long number : counts) {
                                sb.append(number + ":");
                            }

                            long total = totalObjects(obj);
                            if(currBucketVal!=0) {
                                sb.append("\ncleaner loop: wiped value was non-zero, emitting obj:total = "
                                            + obj + ":" + total);
                                _collector.emit(new Values(obj, total));
                            }
                            if(total==0) {
                                sb.append("\ncleaner loop: total dropped to 0, removing obj=" + obj);
                                _objectCounts.remove(obj);
                            }
                          }
                      }
                      lastBucket = currBucket;
                  }
                  long delta = millisPerBucket(_numBuckets) - (System.currentTimeMillis() % millisPerBucket(_numBuckets));
                  Utils.sleep(delta);
                  sb.append("\nsleep="+delta+"毫秒.end#########\n");
                  LOG.info(sb.toString());
                }
            }
        });
        cleaner.start();
    }

    public void execute(Tuple tuple) {
        StringBuffer sb = new StringBuffer();
        Object obj = tuple.getValue(0);
        int bucket = currentBucket(_numBuckets);
        sb.append("\n=======\nexecute方法:当前值:"+obj+"当前桶:bucket: " + bucket);
        synchronized(_objectCounts) {
            long[] curr = _objectCounts.get(obj);
            if(curr==null) {
                curr = new long[_numBuckets];
                _objectCounts.put(obj, curr);
                sb.append("\n新建,_objectCounts["+_objectCounts.toString()+"]");
            }
            curr[bucket]++;
            sb.append("\nexecute方法:接受到的merchandiseIDS:" + obj.toString() + ",long数组:\n");

            for (long number : curr) {
                sb.append(number + ":");
            }

            sb.append("\nexecute方法:发射的数据: " + obj + ":" + totalObjects(obj)+"\n========");

            _collector.emit(new Values(obj, totalObjects(obj)));
            _collector.ack(tuple);
            LOG.info(sb.toString());
        }
    }
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("obj", "count"));
    }
}
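
To make the bucket arithmetic concrete, here is a small standalone sketch (not from the original post) that plugs in the hypothetical parameters hinted at in the comments above, numBuckets = 60 and trackMinutes = 10, so each bucket covers 10 seconds and the ring holds a 10-minute sliding window. The cleaner thread sleeps until the next bucket boundary, zeroes the bucket one position ahead of the current one, and re-emits the reduced totals:

public class BucketMathDemo {
    public static void main(String[] args) {
        // Hypothetical parameters for illustration: 60 buckets covering a 10-minute window.
        int numBuckets = 60;
        int trackMinutes = 10;

        int secondsPerBucket = trackMinutes * 60 / numBuckets;               // 600 / 60 = 10 seconds per bucket
        int currentSecond = (int) (System.currentTimeMillis() / 1000);
        int currentBucket = (currentSecond / secondsPerBucket) % numBuckets; // bucket being written right now
        int bucketToWipe = (currentBucket + 1) % numBuckets;                 // oldest bucket in the ring, wiped next

        System.out.println("secondsPerBucket=" + secondsPerBucket
                + ", currentBucket=" + currentBucket
                + ", bucketToWipe=" + bucketToWipe);
    }
}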

Bolt 2 maintains a Top-N ranking. If bolt 2 runs with a parallelism of 3, there are three independent Top-N rankings, one per task (the wiring sketch after bolt 3 shows how tuples are routed to them):

package com.cucc.roam.storm.bolt;

import backtype.storm.task.TopologyContext;
import backtype.storm.topology.BasicOutputCollector;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseBasicBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;

import org.apache.log4j.Logger;


public class RankObjects extends BaseBasicBolt {
    public static Logger LOG = Logger.getLogger(RankObjects.class);

    List<List> _rankings = new ArrayList<List>();

    int _count;
    Long _lastTime = null;

    public RankObjects(int n) {
        _count = n;
    }

    private int _compare(List one, List two) {
        long valueOne = (Long) one.get(1);
        long valueTwo = (Long) two.get(1);
        long delta = valueTwo - valueOne;
        if(delta > 0) {
            return 1;
        } else if (delta < 0) {
            return -1;
        } else {
            return 0;
        }
    }

    private Integer _find(Object tag) {
        for(int i = 0; i < _rankings.size(); ++i) {
            Object cur = _rankings.get(i).get(0);
            if (cur.equals(tag)) {
                return i;
            }
        }
        return null;
    }

    public void execute(Tuple tuple, BasicOutputCollector collector) {
        StringBuffer sb = new StringBuffer();
        Object tag = tuple.getValue(0);
        sb.append("\n~~~~~~~~~\nrank,tag="+tag+",_rankings["+_rankings.toString()+"].");
        Integer existingIndex = _find(tag);
        if (null != existingIndex) {
            sb.append("\nrank,set["+tuple.getValues()+"]..");
            _rankings.set(existingIndex, tuple.getValues());
        } else {
            sb.append("\nrank,add["+tuple.getValues()+"]..");
            _rankings.add(tuple.getValues());
        }
        Collections.sort(_rankings, new Comparator<List>() {
            public int compare(List o1, List o2) {
                return _compare(o1, o2);
            }
        });
        if (_rankings.size() > _count) {
            _rankings.remove(_count);
            sb.append("\nremove后,_rankings["+_rankings.toString()+"].");
        }
        long currentTime = System.currentTimeMillis();
        if(_lastTime==null || currentTime >= _lastTime + 2000) {
            sb.append("\nrank,emit["+_rankings+"]...");
            collector.emit(new Values(new ArrayList(_rankings)));
            _lastTime = currentTime;
        }
        sb.append("\n~~~~~~~~~~~\n");
        LOG.info(sb.toString());
    }

    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("list"));
    }
}
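
Each tuple that RankObjects emits downstream carries a single field ("list") whose value is that task's current ranking: a list of [obj, count] pairs (the values of the tuples received from bolt 1), sorted by count in descending order. With hypothetical merchandise IDs the payload looks like this:

[[1001, 42], [1005, 17], [1003, 9]]

Bolt 3 receives one such partial ranking from every RankObjects task and folds them into a single global ranking.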

Bolt 3 merges the partial rankings from bolt 2 into the final global Top N:

package com.cucc.roam.storm.bolt;

import org.apache.log4j.Logger;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.BasicOutputCollector;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseBasicBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;

public class MergeObjects extends BaseBasicBolt {
    public static Logger LOG = Logger.getLogger(MergeObjects.class);

    private List<List> _rankings = new ArrayList();
    int _count = 10;
    Long _lastTime;

    public MergeObjects(int n) {
        _count = n;
    }

    private int _compare(List one, List two) {
        long valueOne = (Long) one.get(1);
        long valueTwo = (Long) two.get(1);
        long delta = valueTwo - valueOne;
        if(delta > 0) {
            return 1;
        } else if (delta < 0) {
            return -1;
        } else {
            return 0;
        }
    }

    private Integer _find(Object tag) {
        for(int i = 0; i < _rankings.size(); ++i) {
            Object cur = _rankings.get(i).get(0);
            if (cur.equals(tag)) {
                return i;
            }
        }
        return null;
    }

    public void execute(Tuple tuple, BasicOutputCollector collector) {
        StringBuffer sb = new StringBuffer();
        List<List> merging = (List) tuple.getValue(0);
        sb.append("\n$$$$$$$$$$$$$\nmerge,get,merging["+merging.toString()+"].");
        sb.append("\n 开始_rankings["+_rankings.toString()+"]");
        for(List pair : merging) {
            Integer existingIndex = _find(pair.get(0));
            if (null != existingIndex) {
                _rankings.set(existingIndex, pair);
            } else {
                _rankings.add(pair);
            }

            Collections.sort(_rankings, new Comparator<List>() {
                public int compare(List o1, List o2) {
                    return _compare(o1, o2);
                }
            });

            if (_rankings.size() > _count) {
                _rankings.subList(_count, _rankings.size()).clear();
            }
        }
        sb.append("\n结束_rankings["+_rankings.toString()+"]");

        long currentTime = System.currentTimeMillis();
        if(_lastTime==null || currentTime >= _lastTime + 2000) {
            collector.emit(new Values(new ArrayList(_rankings)));
            LOG.info("\n最后的实时结果Rankings: " + _rankings);
            _lastTime = currentTime;
        }
        sb.append("\n$$$$$$$$$$$$$\n");
        LOG.info(sb.toString());
    }

    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("list"));
    }
}
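
Putting the three bolts together: the output of RollingCountObjects is fields-grouped on "obj" so every update for the same object lands on the same RankObjects task, and all partial rankings are funneled into a single MergeObjects task via a global grouping. Below is a minimal wiring sketch (not part of the original post); the spout, component names, parallelism values, and TOP_N are placeholder assumptions, and TestWordSpout is just a built-in Storm test spout that emits a single field named "word":

import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.testing.TestWordSpout;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.tuple.Fields;
import com.cucc.roam.storm.bolt.MergeObjects;
import com.cucc.roam.storm.bolt.RankObjects;
import com.cucc.roam.storm.bolt.RollingCountObjects;

public class RollingTopNTopology {
    public static void main(String[] args) {
        int TOP_N = 5; // placeholder ranking size

        TopologyBuilder builder = new TopologyBuilder();
        // Placeholder spout emitting a single field "word"; replace with a real data source.
        builder.setSpout("spout", new TestWordSpout(), 1);
        // Bolt 1: rolling counts over 60 buckets spanning 10 minutes, partitioned by the counted object.
        builder.setBolt("count", new RollingCountObjects(60, 10), 4)
               .fieldsGrouping("spout", new Fields("word"));
        // Bolt 2: partial Top-N per task; fieldsGrouping on "obj" keeps each object on one RankObjects task.
        builder.setBolt("rank", new RankObjects(TOP_N), 3)
               .fieldsGrouping("count", new Fields("obj"));
        // Bolt 3: a single task merges all partial rankings into the global Top N.
        builder.setBolt("merge", new MergeObjects(TOP_N))
               .globalGrouping("rank");

        Config conf = new Config();
        new LocalCluster().submitTopology("rolling-top-n", conf, builder.createTopology());
    }
}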

=============
Reposted from: http://my.oschina.net/infiniteSpace/blog/309784
