014 - Case Study: Computing Website PV with Storm

Using Storm to count a website's PV requires attention to two aspects: (1) performance and (2) thread safety.


1. Requirements Analysis
(1) The most commonly used website traffic metric
PV (page views): count(session_id)

(2) Under multi-threading, pay attention to thread safety
PV counting

Approach analysis
Is the following feasible?
1. Define a static long pv and guard the increment with synchronized.
synchronized and Lock are effective within a single JVM, but not across multiple JVMs.
The usual code declares a static variable and protects it with the synchronized keyword:
   // Shared across threads; fine within a single JVM, but in a distributed
   // system each worker JVM has its own copy, so the overall count is wrong.
   static private long pv = 0;
     /**
     * Process a single tuple of input.
     * @param input The input tuple to be processed.
     */
     @Override
     public void execute(Tuple input) {

         String line = input.getStringByField("line");
         synchronized (this) { // serializes the increment across threads; only effective within one JVM
             if (StringUtils.isNotBlank(line)) {
                 pv++;
             }
         }
         Thread currentThread = Thread.currentThread();
         System.out.println(currentThread.getName() + "[" + currentThread.getId() + "]" + "->" + pv);
    }

Two feasible approaches:
1. Under shuffleGrouping, approximate the total PV as one executor's pv * the number of executors (see the sketch after this list).
2. A first-level bolt does partial aggregation with high parallelism, and a second-level single-threaded bolt does the global aggregation.

Thread safety: multi-threaded processing must produce the same result as single-threaded processing.
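
A minimal sketch of approach 1, assuming shuffleGrouping spreads tuples roughly evenly across the PVBolt executors (the class name and numbers below are hypothetical, for illustration only):

public class PvEstimateSketch {
    public static void main(String[] args) {
        long localPv = 250000L;      // cumulative count observed by one PVBolt executor (hypothetical)
        int executorParallelism = 4; // must match the parallelism hint given to the bolt
        // shuffleGrouping distributes tuples roughly evenly, so this is only an approximation
        long estimatedTotalPv = localPv * executorParallelism;
        System.out.println("Estimated total PV: " + estimatedTotalPv);
    }
}

Because approach 1 only yields an approximation, the rest of this post implements approach 2.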

2. PV Counting Flow Diagram and Storm Code



The general approach for aggregating data with Storm: a first-level bolt, running with high parallelism across multiple threads, computes partial counts, and a single-threaded second-level bolt then performs the global aggregation and produces the result.
 (1) PVTopology main program
package com.yun.storm.pv;

import java.util.HashMap;
import java.util.Map;

import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.topology.TopologyBuilder;

/**
 * Topology for real-time PV counting
 * @author shenfl
 * @version V1.0
 */
public class PVTopology {

    
    public final static String SPOUT_ID = PVSpout.class.getSimpleName();
    public final static String PVBOLT_ID = PVBolt.class.getSimpleName();
    public final static String PVTOPOLOGY_ID = PVTopology.class.getSimpleName();
    public final static String PVSUMBOLT_ID = PVSumBolt.class.getSimpleName();
    
    public static void main(String[] args) {
         TopologyBuilder builder = new TopologyBuilder();

          /*// ZooKeeper address used by Kafka
         String brokerZkStr = "192.168.35:2181,192.168.36:2181,192.168.37:2181";
         ZkHosts zkHosts = new ZkHosts(brokerZkStr);
         // name of the Kafka topic that holds the data
         String topic = "pvtopic";
         // a ZooKeeper root path under which the KafkaSpout stores its read offsets and related state
         String zkRoot = "/kafkaspout";
         String id = UUID.randomUUID().toString();
         SpoutConfig spoutconf = new SpoutConfig(zkHosts, topic, zkRoot, id);

         builder.setSpout(SPOUT_ID, new KafkaSpout(spoutconf), 1);// single-threaded*/
         builder.setSpout(SPOUT_ID, new PVSpout(), 1);
         builder.setBolt(PVBOLT_ID, new PVBolt(), 4).setNumTasks(8).shuffleGrouping(SPOUT_ID);// 4 executors, 8 task instances (each executor runs 2 tasks)
         builder.setBolt(PVSUMBOLT_ID, new PVSumBolt(), 1).shuffleGrouping(PVBOLT_ID);// single-threaded global aggregation

         Map<String, Object> conf = new HashMap<String, Object>();
         conf.put(Config.TOPOLOGY_RECEIVER_BUFFER_SIZE, 8);
         conf.put(Config.TOPOLOGY_TRANSFER_BUFFER_SIZE, 32);
         conf.put(Config.TOPOLOGY_EXECUTOR_RECEIVE_BUFFER_SIZE, 16384);
         conf.put(Config.TOPOLOGY_EXECUTOR_SEND_BUFFER_SIZE, 16384);

    /*    try {
             StormSubmitter.submitTopology(PVTOPOLOGY_ID, conf, builder.createTopology());
         } catch (Exception e) {
             e.printStackTrace();
         } */
         LocalCluster cluster = new LocalCluster();
         cluster.submitTopology(PVTOPOLOGY_ID, conf, builder.createTopology());
    }
}   



Simulated data source:
package com.yun.storm.pv;

import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.FileUtils;

import com.yun.redis.PropertyReader;

import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichSpout;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Values;

/**
* Reads PV user-behavior log data in real time; the data could come from a database, from Kafka, or from the file system.
*
* @author shenfl
* @version V1.0
*/
public class PVSpout extends BaseRichSpout {

     /**
     *
     */
     private static final long serialVersionUID = 1L;

     private SpoutOutputCollector collector;
     private Map stormConf;
     /**
     * Called once when the PVSpout is initialized.
     *
     * @param conf
     *            The Storm configuration for this spout.
     * @param context
     *            Can be used to obtain each task's task id.
     * @param collector
     *            The collector is used to emit tuples from this spout.
     */
     @Override
     public void open(Map stormConf, TopologyContext context, SpoutOutputCollector collector) {
          this.collector = collector;
          this.stormConf = stormConf;
     }

     /**
     * Called over and over in an endless loop.
     */
     @Override
     public void nextTuple() {

          // locate the data source directory
          try {
               String dataDir = PropertyReader.getProperty("parameter.properties", "data.dir");
               File file = new File(dataDir);
               // list all .log files under the directory (recursively)
               Collection<File> listFiles = FileUtils.listFiles(file, new String[]{"log"},true);
              
               for (File f : listFiles) {
                    // process one file: emit each line as a tuple
                    List<String> readLines = FileUtils.readLines(f);
                    for (String line : readLines) {
                         this.collector.emit(new Values(line));
                    }
                    // the file has been processed; rename it so it is not read again
                    try {
                         File srcFile = f.getAbsoluteFile();
                         File destFile = new File(srcFile + ".done." + System.currentTimeMillis());
                         FileUtils.moveFile(srcFile, destFile);
                    } catch (IOException e) {
                         e.printStackTrace();
                    }
               }
          } catch (Exception e) {
               e.printStackTrace();
          }
     }
     /**
     * Declare the output schema for all the streams of this topology.
     *
     * @param declarer
     *            this is used to declare output stream ids, output fields, and
     *            whether or not each output stream is a direct stream
     */
     @Override
     public void declareOutputFields(OutputFieldsDeclarer declarer) {
          declarer.declare(new Fields("line"));
     }
}
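
PVSpout relies on com.yun.redis.PropertyReader, which is not shown in this post. A minimal stand-in, assuming it simply loads a properties file from the classpath and returns the value for a key, might look like the class below (an illustration, not the author's actual implementation):

package com.yun.redis;

import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

public class PropertyReader {

    /**
     * Load the given properties file from the classpath and return the value of one key.
     */
    public static String getProperty(String fileName, String key) {
        Properties props = new Properties();
        InputStream in = PropertyReader.class.getClassLoader().getResourceAsStream(fileName);
        if (in == null) {
            return null; // file not found on the classpath
        }
        try {
            props.load(in);
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            try {
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return props.getProperty(key);
    }
}

With a reader like this, parameter.properties would contain an entry such as data.dir=/path/to/pv/logs (an illustrative value); PVSpout scans that directory for *.log files, emits each line, and then renames each finished file with a .done.<timestamp> suffix so it is not read twice.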


 (2) First-level bolt: partial aggregation under high parallelism
package com.yun.storm.pv;

import java.util.Map;

import org.apache.commons.lang.StringUtils;

import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;

/**
 * Receives the data emitted by PVSpout; PVTopology runs this bolt with multiple
 * threads and each task reports its own PV count.
 *
 * In the multi-threaded case the PV data can only be aggregated partially here,
 * not globally; the partial results are forwarded to a single-threaded bolt
 * (PVSumBolt) for the global aggregation.
 *
 * @author shenfl
 * @version V1.0
 */
public class PVBolt extends BaseRichBolt {

    /**
     *
     */
    private static final long serialVersionUID = 1L;
    private OutputCollector collector;
    private TopologyContext context;

    /**
     * Called once when the bolt instance is initialized.
     *
     * @param stormConf
     *            The Storm configuration for this bolt.
     * @param context
     *            This object can be used to get information about this task's
     *            place within the topology, including the task id and component
     *            id of this task, input and output information, etc.
     * @param collector
     *            The collector is used to emit tuples from this bolt
     */
    @Override
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
          this.context = context;
          this.collector = collector;
    }

    private long pv = 0;

    /**
     * Process a single tuple of input.
     *
     * @param input
     *            The input tuple to be processed.
     */
    @Override
    public void execute(Tuple input) {

          try {
              String line = input.getStringByField("line");
              if (StringUtils.isNotBlank(line)) {
                  pv++;
              }
              //System.out.println(Thread.currentThread().getName() + "[" + Thread.currentThread().getId() + "]" + context.getThisTaskId() + "->" + pv);
              //this.collector.emit(new Values(Thread.currentThread().getId(), pv));// only suitable for a single thread with a single task
              this.collector.emit(new Values(context.getThisTaskId(), pv));// works for one thread with one or more tasks: the task id is unique
              this.collector.ack(input);
          } catch (Exception e) {
              e.printStackTrace();
              this.collector.fail(input);
          }
    }

    /**
     * Declare the output schema for all the streams of this topology.
     *
     * @param declarer
     *            this is used to declare output stream ids, output fields, and
     *            whether or not each output stream is a direct stream
     */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
          declarer.declare(new Fields("taskId", "pv"));
    }
}


 (3) Second-level bolt: single-threaded global aggregation
package com.yun.storm.pv;

import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Tuple;

/**
* Aggregates the partial results from the multiple PVBolt tasks
* @author shenfl
*
*/
public class PVSumBolt extends BaseRichBolt{
     /**
     *
     */
     private static final long serialVersionUID = 1L;
     private OutputCollector collector;
     private Map<Integer, Long> map = new HashMap<Integer, Long>();// <taskId, PV count>

     @Override
     public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
          this.collector = collector;
     }
     @Override
     public void execute(Tuple input) {
          try {
               Integer taskId = input.getIntegerByField("taskId");
               Long pv = input.getLongByField("pv");
               map.put(taskId, pv);// the map ends up with one entry per PVBolt task
              
               long sum = 0;// total PV: iterate over the map values and sum them
               for (Entry<Integer, Long> e : map.entrySet()) {
                    sum += e.getValue();
               }
               System.out.println("current time: " + System.currentTimeMillis() + " -> total pv: " + sum);
               this.collector.ack(input);
          }catch(Exception e){
               e.printStackTrace();
               this.collector.fail(input);
          }
     }

     @Override
     public void declareOutputFields(OutputFieldsDeclarer declarer) {
         
     }
}
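
To see why this map-merge yields the global total: each PVBolt task always emits its latest cumulative count, keyed by its unique task id, so PVSumBolt overwrites the previous value for that task and sums all values. A tiny standalone sketch of that merge logic (hypothetical numbers, not part of the topology):

import java.util.HashMap;
import java.util.Map;

public class SumMergeSketch {
    public static void main(String[] args) {
        Map<Integer, Long> partials = new HashMap<Integer, Long>();
        // hypothetical updates arriving from two PVBolt tasks (taskId -> cumulative pv)
        partials.put(1, 3L);   // task 1 has seen 3 lines so far
        partials.put(2, 5L);   // task 2 has seen 5 lines so far
        partials.put(1, 10L);  // task 1's newer cumulative count replaces its old one
        long sum = 0;
        for (long v : partials.values()) {
            sum += v;
        }
        System.out.println("Global PV: " + sum); // prints 15
    }
}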


