进行业务计算:pv、uv、vv、newip、newcust
pv
package com.liming.flux;
import java.util.UUID;
import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.generated.StormTopology;
import backtype.storm.spout.SchemeAsMultiScheme;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.utils.Utils;
import storm.kafka.BrokerHosts;
import storm.kafka.KafkaSpout;
import storm.kafka.SpoutConfig;
import storm.kafka.StringScheme;
import storm.kafka.ZkHosts;
/**
 * Builds and locally runs the flux topology for the pv metric:
 * Kafka spout -> ClearBolt (parse/clean) -> PvBolt (pv) -> PrintBolt (stdout).
 */
public class FluxTopology {
    public static void main(String[] args) {
        // Spout id; must be unique within the topology.
        final String kafkaSpoutId = "flux_spout";
        // Kafka topic the spout consumes from.
        final String consumeTopic = "flux_topic";
        // ZooKeeper ensemble used by the Kafka spout for broker discovery.
        final String zkHosts = "192.168.239.129:2181";

        // Spout configuration: zk offset root "/<topic>", random consumer id
        // so every run starts under a fresh offset path.
        BrokerHosts brokerHosts = new ZkHosts(zkHosts);
        SpoutConfig spoutConf = new SpoutConfig(brokerHosts, consumeTopic, "/" + consumeTopic, UUID.randomUUID().toString());
        // Deliver each Kafka message as a single string field ("str").
        spoutConf.scheme = new SchemeAsMultiScheme(new StringScheme());
        KafkaSpout kafkaSpout = new KafkaSpout(spoutConf);

        // Wire the processing pipeline.
        TopologyBuilder builder = new TopologyBuilder();
        builder.setSpout(kafkaSpoutId, kafkaSpout);
        builder.setBolt("ClearBolt", new ClearBolt()).shuffleGrouping(kafkaSpoutId); // clean raw lines
        builder.setBolt("PvBolt", new PvBolt()).shuffleGrouping("ClearBolt");        // compute pv
        builder.setBolt("PrintBolt", new PrintBolt()).shuffleGrouping("PvBolt");     // print results
        StormTopology topology = builder.createTopology();

        // Submit to an in-process cluster, let it run, then tear everything down.
        Config conf = new Config();
        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology("MyTopology", conf, topology);
        // NOTE(review): this sleeps 1000 * 1000 ms (~16.7 minutes), although the
        // original comment claimed 10 seconds — confirm the intended run time.
        Utils.sleep(1000 * 1000);
        cluster.killTopology("MyTopology");
        cluster.shutdown();
    }
}
package com.liming.flux;
import java.util.Date;
import java.util.Map;
import com.liming.flux.utils.FluxUtils;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;
/**
 * Parses one raw '|'-separated log line into the named fields used by the
 * downstream metric bolts. Malformed lines are logged and skipped, so a single
 * bad record can no longer throw and kill the bolt (the original crashed on
 * short lines or a non-numeric timestamp).
 *
 * Output fields: time (yyyy-MM-dd), uv_id, ss_id, ss_time, urlname, cip.
 */
public class ClearBolt extends BaseRichBolt {
    private OutputCollector collector;

    @Override
    public void prepare(Map map, TopologyContext topology, OutputCollector collector) {
        this.collector = collector;
    }

    @Override
    public void execute(Tuple input) {
        String line = input.getStringByField("str");
        try {
            String[] values = line.split("\\|");
            // Field layout assumed from the producer format — TODO confirm:
            //  [1]=urlname, [13]=uv_id, [14]="ssId_?_ssTime", [15]=client ip
            String uvId = values[13];
            String[] sessionParts = values[14].split("_"); // split once, not twice
            String ssId = sessionParts[0];
            String ssTime = sessionParts[2];
            String urlname = values[1];
            String cip = values[15];
            // ss_time is an epoch-millis string; normalize to yyyy-MM-dd.
            String time = FluxUtils.formatDate(new Date(Long.parseLong(ssTime)));
            collector.emit(new Values(time, uvId, ssId, ssTime, urlname, cip));
        } catch (RuntimeException e) {
            // Too few fields / non-numeric timestamp: drop the record instead
            // of letting the exception take down the executor.
            System.err.println("ClearBolt: dropping malformed line: " + line);
        }
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("time", "uv_id", "ss_id", "ss_time", "urlname", "cip"));
    }
}
package com.liming.flux;
import java.util.Map;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;
/**
 * Page-view counter: emits the constant 1 for every incoming tuple; a
 * downstream consumer aggregates these into the pv total.
 */
public class PvBolt extends BaseRichBolt {
    private OutputCollector collector;

    @Override
    public void prepare(Map map, TopologyContext topology, OutputCollector collector) {
        this.collector = collector;
    }

    @Override
    public void execute(Tuple input) {
        // One tuple in == one page view; forward a single 1.
        collector.emit(new Values(1));
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("pv"));
    }
}
package com.liming.flux;
import java.util.Iterator;
import java.util.Map;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
/**
 * Debug sink: prints every field of each incoming tuple to stdout as
 * ------key:value------ pairs on one line. Terminal bolt — declares no
 * output fields.
 */
public class PrintBolt extends BaseRichBolt {
    private OutputCollector collector = null;

    @Override
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
    }

    @Override
    public void execute(Tuple input) {
        Fields fields = input.getFields();
        // StringBuilder instead of StringBuffer: the buffer is method-local,
        // so the synchronized variant buys nothing.
        StringBuilder sb = new StringBuilder();
        Iterator<String> it = fields.iterator();
        while (it.hasNext()) {
            String key = it.next();
            sb.append("------").append(key).append(":").append(input.getValueByField(key)).append("------");
        }
        System.out.println(sb.toString());
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // Nothing emitted downstream.
    }
}
以下证明处理pv成功
uv
1.方案分析
计算uv时,有两种方法可以选择,本例采用了方法一
方法一:串行处理
这种方法在PvBolt的输出中需要再带上原始数据这样才能继续计算Uv
方法二:并行处理
这种方法就是每个业务各自计算
2.计算uv
(1)注意:在PvBolt的输出中除了pv外将原始的数据也带上
(2)如果uv_id在今天的其他数据中出现过,输出为0,否则输出为1。判断时需要有中间数据做参考,而storm中只有当前数据没有历史数据,所以需要有一个机制来存放中间数据。
可选的方案:
存储到本地文件系统:直接pass,因为有更多的选择可采取
直接在内存中设置map来存储:如果做扩展并发的话,每个map中只存储了部分数据,所以不合适
HDFS:存储海量数据,高吞吐量(一旦开始持续读,速度很快),不是低延时,而本例实时系统需要低延时,所以不合适
KAFKA:是队列,适合一个一个取数据,本例需要一次性取多个数据,所以不合适
redis:是缓存,数据存储到内存中,但达到了某个阈值后会落地,这样影响查询速度违背低延时,如果是少量数据的话可以选用,但本例是海量数据,所以不合适
mysql:关系型数据库,如果要存储非关系型数据就不适用了,所以不合适
hbase:非关系型数据库(可以存储非关系型数据),查询速度快(低延时,PB级别的数据都不会影响性能要胜于redis),可扩展(可存海量数据),合适
在hbase中创建flux表:
由于需要在程序中操作hbase所以要导入相关jar包
(3)编写代码
要将中间数据存入hbase中:
builder.setBolt("ToHbaseBolt", new ToHbaseBolt()).shuffleGrouping("UvBolt");
package com.liming.flux;
import java.util.UUID;
import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.generated.StormTopology;
import backtype.storm.spout.SchemeAsMultiScheme;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.utils.Utils;
import storm.kafka.BrokerHosts;
import storm.kafka.KafkaSpout;
import storm.kafka.SpoutConfig;
import storm.kafka.StringScheme;
import storm.kafka.ZkHosts;
/**
 * Builds and locally runs the flux topology for pv and uv:
 * Kafka spout -> ClearBolt -> PvBolt -> UvBolt, which fans out to
 * PrintBolt (stdout) and ToHbaseBolt (persist intermediate data).
 */
public class FluxTopology {
    public static void main(String[] args) {
        // Spout id; must be unique within the topology.
        final String kafkaSpoutId = "flux_spout";
        // Kafka topic the spout consumes from.
        final String consumeTopic = "flux_topic";
        // ZooKeeper ensemble used by the Kafka spout for broker discovery.
        final String zkHosts = "192.168.239.129:2181";

        // Spout configuration: zk offset root "/<topic>", random consumer id
        // so every run starts under a fresh offset path.
        BrokerHosts brokerHosts = new ZkHosts(zkHosts);
        SpoutConfig spoutConf = new SpoutConfig(brokerHosts, consumeTopic, "/" + consumeTopic, UUID.randomUUID().toString());
        // Deliver each Kafka message as a single string field ("str").
        spoutConf.scheme = new SchemeAsMultiScheme(new StringScheme());
        KafkaSpout kafkaSpout = new KafkaSpout(spoutConf);

        // Wire the processing pipeline.
        TopologyBuilder builder = new TopologyBuilder();
        builder.setSpout(kafkaSpoutId, kafkaSpout);
        builder.setBolt("ClearBolt", new ClearBolt()).shuffleGrouping(kafkaSpoutId); // clean raw lines
        builder.setBolt("PvBolt", new PvBolt()).shuffleGrouping("ClearBolt");        // append pv
        builder.setBolt("UvBolt", new UvBolt()).shuffleGrouping("PvBolt");           // append uv
        builder.setBolt("PrintBolt", new PrintBolt()).shuffleGrouping("UvBolt");     // print results
        builder.setBolt("ToHbaseBolt", new ToHbaseBolt()).shuffleGrouping("UvBolt"); // persist history
        StormTopology topology = builder.createTopology();

        // Submit to an in-process cluster, let it run, then tear everything down.
        Config conf = new Config();
        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology("MyTopology", conf, topology);
        // NOTE(review): this sleeps 1000 * 1000 ms (~16.7 minutes), although the
        // original comment claimed 10 seconds — confirm the intended run time.
        Utils.sleep(1000 * 1000);
        cluster.killTopology("MyTopology");
        cluster.shutdown();
    }
}
package com.liming.flux;
import java.util.Date;
import java.util.Map;
import com.liming.flux.utils.FluxUtils;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;
/**
 * Parses one raw '|'-separated log line into the named fields used by the
 * downstream metric bolts. Malformed lines are logged and skipped, so a single
 * bad record can no longer throw and kill the bolt (the original crashed on
 * short lines or a non-numeric timestamp).
 *
 * Output fields: time (yyyy-MM-dd), uv_id, ss_id, ss_time, urlname, cip.
 */
public class ClearBolt extends BaseRichBolt {
    private OutputCollector collector;

    @Override
    public void prepare(Map map, TopologyContext topology, OutputCollector collector) {
        this.collector = collector;
    }

    @Override
    public void execute(Tuple input) {
        String line = input.getStringByField("str");
        try {
            String[] values = line.split("\\|");
            // Field layout assumed from the producer format — TODO confirm:
            //  [1]=urlname, [13]=uv_id, [14]="ssId_?_ssTime", [15]=client ip
            String uvId = values[13];
            String[] sessionParts = values[14].split("_"); // split once, not twice
            String ssId = sessionParts[0];
            String ssTime = sessionParts[2];
            String urlname = values[1];
            String cip = values[15];
            // ss_time is an epoch-millis string; normalize to yyyy-MM-dd.
            String time = FluxUtils.formatDate(new Date(Long.parseLong(ssTime)));
            collector.emit(new Values(time, uvId, ssId, ssTime, urlname, cip));
        } catch (RuntimeException e) {
            // Too few fields / non-numeric timestamp: drop the record instead
            // of letting the exception take down the executor.
            System.err.println("ClearBolt: dropping malformed line: " + line);
        }
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("time", "uv_id", "ss_id", "ss_time", "urlname", "cip"));
    }
}
/**
 * Page-view bolt (pass-through variant): forwards the cleaned record and
 * appends pv = 1 so downstream bolts can keep computing on the raw fields.
 *
 * Output fields: time, uv_id, ss_id, ss_time, urlname, cip, pv.
 */
public class PvBolt extends BaseRichBolt {
    private OutputCollector collector;

    @Override
    public void prepare(Map map, TopologyContext topology, OutputCollector collector) {
        this.collector = collector;
    }

    @Override
    public void execute(Tuple input) {
        // Copy the values instead of mutating the list returned by
        // getValues(): that list can be the input tuple's internal backing
        // store, and appending to it would corrupt the tuple in flight.
        List<Object> in = input.getValues();
        Object[] out = new Object[in.size() + 1];
        for (int i = 0; i < in.size(); i++) {
            out[i] = in.get(i);
        }
        out[in.size()] = 1; // every record counts as one page view
        collector.emit(new Values(out));
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("time", "uv_id", "ss_id", "ss_time", "urlname", "cip", "pv"));
    }
}
通过行键查询hbase中符合的数据
package com.liming.flux;
import java.util.List;
import java.util.Map;
import com.liming.flux.dao.HBaseDao;
import com.liming.flux.domain.FluxInfo;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.IRichBolt;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;
/**
 * Unique-visitor bolt: appends uv = 1 when this uv_id has not been seen yet
 * today (no matching row in hbase), otherwise uv = 0. The rowkey pattern
 * queried is "^<time>_<uv_id>_.*$".
 *
 * Output fields: time, uv_id, ss_id, ss_time, urlname, cip, pv, uv.
 */
public class UvBolt extends BaseRichBolt {
    private OutputCollector collector;

    @Override
    public void prepare(Map map, TopologyContext topology, OutputCollector collector) {
        this.collector = collector;
    }

    @Override
    public void execute(Tuple input) {
        // Look for today's rows with the same uv_id.
        // NOTE(review): uv_id is interpolated into the regex unescaped; if ids
        // can contain regex metacharacters this should use Pattern.quote.
        List<FluxInfo> matches = HBaseDao.queryData(
                "^" + input.getStringByField("time") + "_" + input.getStringByField("uv_id") + "_.*$");
        int uv = matches.isEmpty() ? 1 : 0;

        // Copy the values instead of mutating the list returned by
        // getValues(), which can be the input tuple's internal backing store.
        List<Object> in = input.getValues();
        Object[] out = new Object[in.size() + 1];
        for (int i = 0; i < in.size(); i++) {
            out[i] = in.get(i);
        }
        out[in.size()] = uv;
        collector.emit(new Values(out));
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("time", "uv_id", "ss_id", "ss_time", "urlname", "cip", "pv", "uv"));
    }
}
package com.liming.flux;
import java.util.Iterator;
import java.util.Map;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
/**
 * Debug sink: prints every field of each incoming tuple to stdout as
 * ------key:value------ pairs on one line. Terminal bolt — declares no
 * output fields.
 */
public class PrintBolt extends BaseRichBolt {
    private OutputCollector collector = null;

    @Override
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
    }

    @Override
    public void execute(Tuple input) {
        Fields fields = input.getFields();
        // StringBuilder instead of StringBuffer: the buffer is method-local,
        // so the synchronized variant buys nothing.
        StringBuilder sb = new StringBuilder();
        Iterator<String> it = fields.iterator();
        while (it.hasNext()) {
            String key = it.next();
            sb.append("------").append(key).append(":").append(input.getValueByField(key)).append("------");
        }
        System.out.println(sb.toString());
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // Nothing emitted downstream.
    }
}
编写将中间数据存入到hbase
package com.liming.flux;
import java.util.Map;
import com.liming.flux.dao.HBaseDao;
import com.liming.flux.domain.FluxInfo;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Tuple;
/**
 * Persistence bolt: writes every incoming record into the hbase "flux" table
 * so later tuples can consult it as history (e.g. UvBolt's uv_id lookup).
 * Terminal bolt — declares no output fields.
 */
public class ToHbaseBolt extends BaseRichBolt {

    @Override
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        // Nothing is emitted downstream, so the collector is not retained.
    }

    @Override
    public void execute(Tuple input) {
        // Repack the named tuple fields into the transport bean and persist it.
        FluxInfo record = new FluxInfo();
        record.setTime(input.getStringByField("time"));
        record.setUv_id(input.getStringByField("uv_id"));
        record.setSs_id(input.getStringByField("ss_id"));
        record.setSs_time(input.getStringByField("ss_time"));
        record.setUrlname(input.getStringByField("urlname"));
        record.setCip(input.getStringByField("cip"));
        HBaseDao.insertData(record);
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // Nothing emitted.
    }
}
工具类:
package com.liming.flux.utils;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
/**
 * Date / random-number helpers shared by the flux bolts.
 *
 * SimpleDateFormat is NOT thread-safe, and these statics are called from
 * multiple Storm executor threads; the original shared one static formatter,
 * which can silently produce corrupted dates under concurrency. Each call now
 * builds its own formatter instead.
 */
public class FluxUtils {
    /** Day-granularity pattern used for both formatting and parsing. */
    private static final String PATTERN = "yyyy-MM-dd";

    /** Formats a date as yyyy-MM-dd (used as the record's day key). */
    public static String formatDate(Date date) {
        return new SimpleDateFormat(PATTERN).format(date);
    }

    /**
     * Parses a yyyy-MM-dd string back into a Date.
     *
     * @throws RuntimeException wrapping the ParseException on bad input
     *         (cause preserved; the redundant printStackTrace was removed)
     */
    public static Date parseDateStr(String dateStr) {
        try {
            return new SimpleDateFormat(PATTERN).parse(dateStr);
        } catch (ParseException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Returns a uniformly random number in [0, 10^len), i.e. at most
     * {@code len} digits. Used as a rowkey uniqueness suffix.
     * The previous Math.round version could return 10^len itself — one digit
     * too many for the rowkey layout.
     */
    public static long randNum(int len) {
        return (long) (Math.random() * Math.pow(10, len));
    }
}
操作hbase的api:
行键的设计:
package com.liming.flux.dao;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.CompareFilter;
import org.apache.hadoop.hbase.filter.RegexStringComparator;
import org.apache.hadoop.hbase.filter.RowFilter;
import org.apache.hadoop.hbase.util.Bytes;
import com.liming.flux.domain.FluxInfo;
import com.liming.flux.utils.FluxUtils;
/**
 * Minimal hbase access layer for the "flux" table (column family cf1).
 * Rowkey layout: time_uvid_ssid_cip_rand8 (see insertData).
 *
 * Fixes over the original: queryData leaked both the HTable and the
 * ResultScanner (never closed); insertData leaked the table on the exception
 * path (close was not in a finally); the raw ArrayList type is parameterized.
 */
public class HBaseDao {
    private static final byte[] CF = Bytes.toBytes("cf1");

    private HBaseDao() {
        // Static utility class; no instances.
    }

    /**
     * Returns every row whose rowkey matches the given regex.
     *
     * @param reg anchored regex applied to rowkeys via a RowFilter
     * @throws RuntimeException wrapping any hbase error (cause preserved)
     */
    public static List<FluxInfo> queryData(String reg) {
        HTable table = null;
        ResultScanner rs = null;
        try {
            Configuration conf = HBaseConfiguration.create();
            conf.set("hbase.zookeeper.quorum", "hadoop01:2181,hadoop02:2181,hadoop03:2181");
            table = new HTable(conf, "flux");
            Scan scan = new Scan();
            scan.setFilter(new RowFilter(CompareFilter.CompareOp.EQUAL, new RegexStringComparator(reg)));
            rs = table.getScanner(scan);
            List<FluxInfo> results = new ArrayList<FluxInfo>();
            Result r;
            while ((r = rs.next()) != null) {
                results.add(toFluxInfo(r));
            }
            return results;
        } catch (Exception e) {
            throw new RuntimeException(e);
        } finally {
            // Always release the scanner and table.
            if (rs != null) {
                rs.close();
            }
            closeQuietly(table);
        }
    }

    /**
     * Persists one record under rowkey time_uvid_ssid_cip_rand8; the random
     * suffix keeps rows for the same visitor/session from colliding.
     *
     * @throws RuntimeException wrapping any hbase error (cause preserved)
     */
    public static void insertData(FluxInfo fi) {
        HTable table = null;
        try {
            Configuration conf = HBaseConfiguration.create();
            conf.set("hbase.zookeeper.quorum", "hadoop01:2181,hadoop02:2181,hadoop03:2181");
            table = new HTable(conf, "flux");
            Put put = new Put(Bytes.toBytes(fi.getTime() + "_" + fi.getUv_id() + "_" + fi.getSs_id() + "_" + fi.getCip() + "_" + FluxUtils.randNum(8)));
            put.add(CF, Bytes.toBytes("time"), Bytes.toBytes(fi.getTime()));
            put.add(CF, Bytes.toBytes("uv_id"), Bytes.toBytes(fi.getUv_id()));
            put.add(CF, Bytes.toBytes("ss_id"), Bytes.toBytes(fi.getSs_id()));
            put.add(CF, Bytes.toBytes("ss_time"), Bytes.toBytes(fi.getSs_time()));
            put.add(CF, Bytes.toBytes("urlname"), Bytes.toBytes(fi.getUrlname()));
            put.add(CF, Bytes.toBytes("cip"), Bytes.toBytes(fi.getCip()));
            table.put(put);
        } catch (Exception e) {
            throw new RuntimeException(e);
        } finally {
            closeQuietly(table);
        }
    }

    /** Reads the cf1 columns of one result row into a FluxInfo bean. */
    private static FluxInfo toFluxInfo(Result r) {
        String time = new String(r.getValue(CF, "time".getBytes()));
        String uv_id = new String(r.getValue(CF, "uv_id".getBytes()));
        String ss_id = new String(r.getValue(CF, "ss_id".getBytes()));
        String ss_time = new String(r.getValue(CF, "ss_time".getBytes()));
        String urlname = new String(r.getValue(CF, "urlname".getBytes()));
        String cip = new String(r.getValue(CF, "cip".getBytes()));
        return new FluxInfo(time, uv_id, ss_id, ss_time, urlname, cip);
    }

    /** Best-effort close; errors on close must not mask the primary result. */
    private static void closeQuietly(HTable table) {
        if (table != null) {
            try {
                table.close();
            } catch (Exception ignored) {
                // nothing useful to do if close itself fails
            }
        }
    }
}
传输的实体类
package com.liming.flux.domain;
/**
 * Transport bean for one cleaned click-stream record, passed between the
 * Storm bolts and the hbase DAO.
 */
public class FluxInfo {

    private String time;    // record day, formatted yyyy-MM-dd by FluxUtils
    private String uv_id;   // visitor id
    private String ss_id;   // session id
    private String ss_time; // session timestamp (epoch millis, as text)
    private String urlname; // requested url
    private String cip;     // client ip

    /** No-arg constructor for incremental population via the setters. */
    public FluxInfo() {
    }

    /** All-field constructor used when reading rows back from hbase. */
    public FluxInfo(String time, String uv_id, String ss_id, String ss_time, String urlname, String cip) {
        this.time = time;
        this.uv_id = uv_id;
        this.ss_id = ss_id;
        this.ss_time = ss_time;
        this.urlname = urlname;
        this.cip = cip;
    }

    public String getTime() { return time; }

    public void setTime(String time) { this.time = time; }

    public String getUv_id() { return uv_id; }

    public void setUv_id(String uv_id) { this.uv_id = uv_id; }

    public String getSs_id() { return ss_id; }

    public void setSs_id(String ss_id) { this.ss_id = ss_id; }

    public String getSs_time() { return ss_time; }

    public void setSs_time(String ss_time) { this.ss_time = ss_time; }

    public String getUrlname() { return urlname; }

    public void setUrlname(String urlname) { this.urlname = urlname; }

    public String getCip() { return cip; }

    public void setCip(String cip) { this.cip = cip; }
}
启动时遇到的问题:因为导入hbase的jar包时log4j的包又冲突了,删除多余的即可
SLF4J: Detected both log4j-over-slf4j.jar AND slf4j-log4j12.jar on the class path, preempting StackOverflowError.
SLF4J: See also http://www.slf4j.org/codes.html#log4jDelegationLoop for more details.
7786 [Thread-18-flux_spout] ERROR backtype.storm.util - Async loop died!
java.lang.NoClassDefFoundError: Could not initialize class org.apache.log4j.Log4jLoggerFactory
at org.apache.log4j.Logger.getLogger(Logger.java:39) ~[log4j-over-slf4j-1.6.6.jar:1.6.6]
at kafka.utils.Logging$class.logger(Logging.scala:24) ~[kafka_2.10-0.10.0.1.jar:na]
at kafka.consumer.SimpleConsumer.logger$lzycompute(SimpleConsumer.scala:35) ~[kafka_2.10-0.10.0.1.jar:na]
at kafka.consumer.SimpleConsumer.logger(SimpleConsumer.scala:35) ~[kafka_2.10-0.10.0.1.jar:na]
at kafka.utils.Logging$class.info(Logging.scala:75) ~[kafka_2.10-0.10.0.1.jar:na]
at kafka.consumer.SimpleConsumer.info(SimpleConsumer.scala:35) ~[kafka_2.10-0.10.0.1.jar:na]
at kafka.consumer.SimpleConsumer.liftedTree1$1(SimpleConsumer.scala:94) ~[kafka_2.10-0.10.0.1.jar:na]
at kafka.consumer.SimpleConsumer.kafka$consumer$SimpleConsumer$$sendRequest(SimpleConsumer.scala:83) ~[kafka_2.10-0.10.0.1.jar:na]
at kafka.consumer.SimpleConsumer.getOffsetsBefore(SimpleConsumer.scala:149) ~[kafka_2.10-0.10.0.1.jar:na]
at kafka.javaapi.consumer.SimpleConsumer.getOffsetsBefore(SimpleConsumer.scala:79) ~[kafka_2.10-0.10.0.1.jar:na]
at storm.kafka.KafkaUtils.getOffset(KafkaUtils.java:77) ~[storm-kafka-0.9.3.jar:0.9.3]
at storm.kafka.KafkaUtils.getOffset(KafkaUtils.java:67) ~[storm-kafka-0.9.3.jar:0.9.3]
at storm.kafka.PartitionManager.<init>(PartitionManager.java:83) ~[storm-kafka-0.9.3.jar:0.9.3]
at storm.kafka.ZkCoordinator.refresh(ZkCoordinator.java:98) ~[storm-kafka-0.9.3.jar:0.9.3]
at storm.kafka.ZkCoordinator.getMyManagedPartitions(ZkCoordinator.java:69) ~[storm-kafka-0.9.3.jar:0.9.3]
at storm.kafka.KafkaSpout.nextTuple(KafkaSpout.java:135) ~[storm-kafka-0.9.3.jar:0.9.3]
at backtype.storm.daemon.executor$fn__3373$fn__3388$fn__3417.invoke(executor.clj:565) ~[storm-core-0.9.3.jar:0.9.3]
at backtype.storm.util$async_loop$fn__464.invoke(util.clj:463) ~[storm-core-0.9.3.jar:0.9.3]
at clojure.lang.AFn.run(AFn.java:24) [clojure-1.5.1.jar:na]
at java.lang.Thread.run(Unknown Source) [na:1.7.0_79]
7787 [Thread-18-flux_spout] ERROR backtype.storm.daemo
测试:第一次访问时uv为1;第二、三次访问时,由于hbase中已经存有该uv_id的中间数据,所以uv为0。