package com.uplooking.bigdata.storm.statictics;
import com.uplooking.bigdata.storm.utils.JedisUtils;
import org.apache.storm.Config;
import org.apache.storm.Constants;
import org.apache.storm.LocalCluster;
import org.apache.storm.StormSubmitter;
import org.apache.storm.generated.StormTopology;
import org.apache.storm.kafka.BrokerHosts;
import org.apache.storm.kafka.KafkaSpout;
import org.apache.storm.kafka.SpoutConfig;
import org.apache.storm.kafka.ZkHosts;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import redis.clients.jedis.Jedis;
import java.util.*;
/**
* Real-time statistics over website user behavior: compute the cumulative PV and UV
* for each time bucket, up to the present moment.
* Technologies involved:
* 1. nginx writes its logs into a directory, so Flume watches that directory
*    (new files, or new content appended to existing files).
* 2. The data Flume collects is pushed into Kafka, which buffers the messages.
* 3. Because the statistics must be real-time, Storm pulls the data from Kafka and
*    computes the cumulative PV and UV for each time bucket.
* 4. The results must be persisted; they are written to Redis.
*
* How is it done?
* Flume + Kafka integration ----> produces the data
* See exec-kafka.conf for the Flume configuration; a minimal sketch of such an agent
* follows (the log path, broker list, and the use of KafkaSink are assumptions):
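*
*   a1.sources = r1
*   a1.channels = c1
*   a1.sinks = k1
*   # exec source tailing the nginx access log (path is an assumption)
*   a1.sources.r1.type = exec
*   a1.sources.r1.command = tail -F /opt/nginx/logs/access.log
*   a1.channels.c1.type = memory
*   # Flume 1.6-style KafkaSink pointing at the test topic
*   a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
*   a1.sinks.k1.topic = test-fk
*   a1.sinks.k1.brokerList = master:9092,slave01:9092,slave02:9092
*   a1.sources.r1.channels = c1
*   a1.sinks.k1.channel = c1
*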
* Create two topics:
* test-fk ----> used for testing
* bin/kafka-topics.sh --create --topic test-fk --zookeeper master:2181,slave01:2181,slave02:2181 --partitions 3 --replication-factor 3
* flume-kafka-storm ----> used for production
* bin/kafka-topics.sh --create --topic flume-kafka-storm --zookeeper master:2181,slave01:2181,slave02:2181 --partitions 3 --replication-factor 3
*
* Testing that Flume produces data into Kafka:
* flume
* nohup /opt/flume/bin/flume-ng agent -c conf -f conf/exec-kafka.conf --name a1 &
* kafka
* bin/kafka-console-consumer.sh --topic test-fk --zookeeper master:2181,slave01:2181,slave02:2181
* Kafka + Storm integration ----> consumes the data
* Storm consumes one Kafka topic, so what plays the role of the spout? A KafkaSpout reading that topic.
* Storm + Redis integration ----> persists the results
* Uses the JedisUtils helper covered earlier.
* Redis could be swapped out for HBase or MySQL.
*/
public class StatisticsPVAndUVTopology {
public static void main(String[] args) throws Exception {
/**
* BrokerHosts hosts ----> the ZooKeeper quorum of the Kafka cluster Storm connects to
* String topic      ----> the Kafka topic Storm consumes
* String zkRoot     ----> the ZooKeeper root path under which the spout keeps its consumer offsets
* String id         ----> a unique id identifying this consumer's offsets under zkRoot
*/
String zkStr = "master:2181,slave01:2181,slave02:2181";
BrokerHosts hosts = new ZkHosts(zkStr);
String topic = "test-fk";
String zkRoot = "/storm-kafka-test";
String id = "storm_kafka_test_id";
SpoutConfig spoutConf = new SpoutConfig(hosts, topic, zkRoot, id);
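// No scheme is set on spoutConf, so the KafkaSpout emits the raw byte[] payload;
// SplitBolt below decodes the bytes itself. An alternative (not used here) is
//   spoutConf.scheme = new SchemeAsMultiScheme(new StringScheme());
// which would make the tuples arrive as Strings.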
TopologyBuilder builder = new TopologyBuilder();
// Set the topology's spout: the KafkaSpout that reads the configured topic
builder.setSpout("kafka_spout_id", new KafkaSpout(spoutConf));
// Set the bolts: first split each raw log line, then aggregate per-hour statistics
builder.setBolt("split_bolt_id", new SplitBolt()).shuffleGrouping("kafka_spout_id");
builder.setBolt("static_bolt_id", new StaticBolt()).shuffleGrouping("split_bolt_id");
// Build and launch the topology
StormTopology stormTopology = builder.createTopology();
String topologyName = StatisticsPVAndUVTopology.class.getSimpleName();
Config config = new Config();
if(args == null || args.length < 1) {
LocalCluster cluster = new LocalCluster();
cluster.submitTopology(topologyName, config, stormTopology);
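// Keep the local JVM alive long enough to observe a few tick flushes, then shut
// the local cluster down (the 60s window is an arbitrary choice for local testing).
Thread.sleep(60 * 1000);
cluster.shutdown();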
}else {
StormSubmitter.submitTopology(topologyName, config, stormTopology);
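// On a real cluster this class is submitted via the storm CLI, along the lines of
// (the jar name is hypothetical):
//   storm jar pv-uv-topology.jar com.uplooking.bigdata.storm.statictics.StatisticsPVAndUVTopology cluster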
}
}
// Takes the data the spout pulled from Kafka and splits it.
// Data format: 27.19.74.143##2016-05-30 17:38:20##GET /static/image/common/faq.gif HTTP/1.1##200##1127
static class SplitBolt extends BaseRichBolt {
private Map conf;
private TopologyContext context;
private OutputCollector collector;
public void prepare(Map conf, TopologyContext context, OutputCollector collector) {
this.conf = conf;
this.context = context;
this.collector = collector;
}
/**
* java.lang.ClassCastException: [B cannot be cast to java.lang.String
* Kafka delivers the payload as a raw byte array, so it cannot be read as a String
* directly; decode the bytes here instead.
* @param tuple
*/
public void execute(Tuple tuple) {
String line = new String(tuple.getBinary(0));
String[] items = line.split("##");
if (items.length < 2) { // split() never returns null; require at least ip and time
return;
}
System.out.println("kafka中的数据:ip--->" + items[0] + ", time--->" + items[1]);
this.collector.emit(new Values(items[0], items[1]));
}
public void declareOutputFields(OutputFieldsDeclarer declarer) {
declarer.declare(new Fields("ip", "time"));
}
}
/**
* Compute PV and UV per time bucket.
* Incoming data:
* ip--->27.19.74.143
* time->2016-05-30 17:38:20
*
* The hour is extracted from the timestamp; each hour bucket maps to a collection
* of IP addresses. The hour is the key, the list of IPs seen in that hour is the value:
* Map<String, List<String>>
*
* For each key, the size of the list is the PV; deduplicating the list into a Set
* gives the UV (set.size()). For example, hour "17" -> [ip1, ip1, ip2] yields
* PV = 3 and UV = 2.
*/
static class StaticBolt extends BaseRichBolt {
private Map conf;
private TopologyContext context;
private OutputCollector collector;
public void prepare(Map conf, TopologyContext context, OutputCollector collector) {
this.conf = conf;
this.context = context;
this.collector = collector;
}
Map<String, List<String>> map = new HashMap<String, List<String>>();
List<String> list = null;
Set<String> set = null;
@Override
public void execute(Tuple input) {
String hour = null;
if (!input.getSourceComponent().equalsIgnoreCase(Constants.SYSTEM_COMPONENT_ID)) { // a regular data tuple, not a system tick
String ip = input.getStringByField("ip");
String timeStr = input.getStringByField("time");
hour = timeStr.substring(11, 13);
list = map.get(hour); // the key must be the hour bucket, not the ip
if (list == null) {
System.out.println("-----1111-----");
list = new ArrayList<>();
}
list.add(ip);
map.put(hour, list); // the collection of IP addresses seen in each hour bucket
} else {
// a tick tuple arrived: compute PV and UV for every bucket and flush to redis
System.out.println("===============start================");
System.out.println(map);
System.out.println("------------------------------------");
Jedis jedis = JedisUtils.getJedis();
for (Map.Entry<String, List<String>> me : map.entrySet()) {
hour = me.getKey();
list = me.getValue();
int pv = list.size();
set = new HashSet<String>(list);
int uv = set.size();
// System.out.println("hour: " + hour + ",pv: " + pv + ",uv:" + uv);
// write the results to redis as one hash per hour bucket
jedis.hset("pv_uv_" + hour, "pv", pv + "");
jedis.hset("pv_uv_" + hour, "uv", uv + "");
}
JedisUtils.close(jedis);
System.out.println("===============end=================");
}
}
@Override
public void declareOutputFields(OutputFieldsDeclarer declarer) {
}
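// Ask Storm to deliver a system tick tuple to this bolt every 10 seconds;
// each tick triggers the flush of the in-memory counts to redis in execute().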
@Override
public Map<String, Object> getComponentConfiguration() {
Map<String, Object> conf = new HashMap<>();
conf.put(Config.TOPOLOGY_TICK_TUPLE_FREQ_SECS, 10);
return conf;
}
}
}
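
/*
 * JedisUtils is not shown in this file. For reference, it is assumed to be a thin
 * wrapper over a JedisPool, along these lines (a sketch, not the actual class;
 * host and port are assumptions):
 *
 *   public final class JedisUtils {
 *       private static final JedisPool POOL =
 *               new JedisPool(new JedisPoolConfig(), "master", 6379);
 *       public static Jedis getJedis() { return POOL.getResource(); }
 *       public static void close(Jedis jedis) { if (jedis != null) jedis.close(); }
 *   }
 */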