0. Import the data collected by the tracking script into Kafka through Flume
a. Write the Flume configuration file
There are two kinds of configuration: the first needs extra plugin jars, the second uses Flume's built-in Kafka sink. The configuration below is the second kind.
#agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
#Source
a1.sources.r1.type = avro
a1.sources.r1.bind = 0.0.0.0
a1.sources.r1.port = 44444
#Sink
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.topic = flux_topic
a1.sinks.k1.brokerList = hadoop01:9092
a1.sinks.k1.requiredAcks = 1
a1.sinks.k1.batchSize = 100
#Channel
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
#Bind the Source and Sink to the Channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
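With the configuration saved to a file (the name flux-kafka.conf below is just an example), the agent can be started with Flume's standard launcher:
bin/flume-ng agent --conf conf --conf-file conf/flux-kafka.conf --name a1 -Dflume.root.logger=INFO,console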
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
If you want to use the first kind of configuration, the following references may help:
Collected Flume configuration errors: (link)
Collected log4j-to-Flume integration errors: (link)
To push logs into Kafka this way, Flume needs extra plugins (see the reference).
Copy the required jars into the lib folder of the Flume installation directory on Linux: (required jars)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
b. Verify that the data reached Kafka through Flume:
Visit the a.jsp page once:
On the log-collection side:
In Flume:
In the Kafka consumer listener:
The data has arrived in Kafka.
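For reference, the consumer listener shown above can be reproduced with the console consumer that ships with Kafka (assuming ZooKeeper runs on hadoop01:2181):
bin/kafka-console-consumer.sh --zookeeper hadoop01:2181 --topic flux_topic --from-beginning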
1. Decide which metrics need real-time processing and which need offline processing
pv:       offline, real-time
uv:       offline, real-time
vv:       offline, real-time
br:       offline; not a natural fit for real-time (though it can be done)
newip:    offline, real-time
avgtime:  offline; not a natural fit for real-time (though it can be done)
newcust:  offline, real-time
viewdeep: offline; not a natural fit for real-time (though it can be done)
br, avgtime and viewdeep can be computed by triggering a Storm flow once an hour, which is close enough to the real-time requirement.
2. Use Storm for real-time processing of pv, uv, vv, newip and newcust
Steps: (1) fetch the data from Kafka: develop a spout that reads from Kafka and emits the data downstream
(2) do the computation (which raises the question of where to store intermediate data)
(3) store the final results
(1) Write a program that fetches data from Kafka
a. Create a new project
b. Import the jars
Import the jars Storm needs
Import the jar that connects Storm to Kafka
Import the jars Kafka needs
Import the other jars
Resources: other jars
Storm can connect to Kafka in two ways: (1) through Trident, (2) through the plain core API. The plain API is used here.
package com.liming.flux;

import java.util.UUID;

import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.generated.StormTopology;
import backtype.storm.spout.SchemeAsMultiScheme;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.utils.Utils;
import storm.kafka.BrokerHosts;
import storm.kafka.KafkaSpout;
import storm.kafka.SpoutConfig;
import storm.kafka.StringScheme;
import storm.kafka.ZkHosts;

public class FluxTopology {
    public static void main(String[] args) {
        // spout id, must be unique within the topology
        String KAFKA_SPOUT_ID = "flux_spout";
        // the kafka topic to consume
        String CONSUME_TOPIC = "flux_topic";
        // the zookeeper address to connect to
        String ZK_HOSTS = "192.168.239.129:2181";
        // connection parameters for the KafkaSpout
        BrokerHosts hosts = new ZkHosts(ZK_HOSTS);
        SpoutConfig spoutConfig = new SpoutConfig(hosts, CONSUME_TOPIC, "/" + CONSUME_TOPIC, UUID.randomUUID().toString());
        // StringScheme makes the spout emit each message as a single string field named "str"
        spoutConfig.scheme = new SchemeAsMultiScheme(new StringScheme());
        KafkaSpout kafkaSpout = new KafkaSpout(spoutConfig);
        // build the topology: KafkaSpout -> PrintBolt
        TopologyBuilder builder = new TopologyBuilder();
        builder.setSpout(KAFKA_SPOUT_ID, kafkaSpout);
        builder.setBolt("printBolt", new PrintBolt()).shuffleGrouping(KAFKA_SPOUT_ID);
        StormTopology topology = builder.createTopology();
        // submit the topology to a local in-process cluster
        Config conf = new Config();
        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology("MyTopology", conf, topology);
        // let it run for 1000 seconds, then kill the topology and shut the cluster down
        Utils.sleep(1000 * 1000);
        cluster.killTopology("MyTopology");
        cluster.shutdown();
    }
}
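Note that LocalCluster runs the topology in-process for testing; to deploy the same topology to a real Storm cluster you would submit it with StormSubmitter.submitTopology instead.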
package com.liming.flux;

import java.util.Map;

import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Tuple;

public class PrintBolt extends BaseRichBolt {
    private OutputCollector collector = null;

    @Override
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
    }

    @Override
    public void execute(Tuple input) {
        // with StringScheme the KafkaSpout emits a single field named "str"
        String stringByField = input.getStringByField("str");
        System.out.println(stringByField);
        // ack so the KafkaSpout does not replay the tuple after a timeout
        collector.ack(input);
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // terminal bolt: nothing is emitted, so no output fields are declared
    }
}
An error shows up: log4j cannot be initialized.
SLF4J: Detected both log4j-over-slf4j.jar AND bound slf4j-log4j12.jar on the class path, preempting StackOverflowError.
SLF4J: See also http://www.slf4j.org/codes.html#log4jDelegationLoop for more details.
8351 [Thread-8-flux_spout] ERROR backtype.storm.util - Async loop died!
java.lang.NoClassDefFoundError: Could not initialize class org.apache.log4j.Log4jLoggerFactory
at org.apache.log4j.Logger.getLogger(Logger.java:39) ~[log4j-over-slf4j-1.6.6.jar:1.6.6]
at kafka.utils.Logging$class.logger(Logging.scala:24) ~[kafka_2.10-0.10.0.1.jar:na]
at kafka.consumer.SimpleConsumer.logger$lzycompute(SimpleConsumer.scala:35) ~[kafka_2.10-0.10.0.1.jar:na]
at kafka.consumer.SimpleConsumer.logger(SimpleConsumer.scala:35) ~[kafka_2.10-0.10.0.1.jar:na]
at kafka.utils.Logging$class.info(Logging.scala:75) ~[kafka_2.10-0.10.0.1.jar:na]
at kafka.consumer.SimpleConsumer.info(SimpleConsumer.scala:35) ~[kafka_2.10-0.10.0.1.jar:na]
at kafka.consumer.SimpleConsumer.liftedTree1$1(SimpleConsumer.scala:94) ~[kafka_2.10-0.10.0.1.jar:na]
at kafka.consumer.SimpleConsumer.kafka$consumer$SimpleConsumer$$sendRequest(SimpleConsumer.scala:83) ~[kafka_2.10-0.10.0.1.jar:na]
at kafka.consumer.SimpleConsumer.getOffsetsBefore(SimpleConsumer.scala:149) ~[kafka_2.10-0.10.0.1.jar:na]
at kafka.javaapi.consumer.SimpleConsumer.getOffsetsBefore(SimpleConsumer.scala:79) ~[kafka_2.10-0.10.0.1.jar:na]
at storm.kafka.KafkaUtils.getOffset(KafkaUtils.java:77) ~[storm-kafka-0.9.3.jar:0.9.3]
at storm.kafka.KafkaUtils.getOffset(KafkaUtils.java:67) ~[storm-kafka-0.9.3.jar:0.9.3]
at storm.kafka.PartitionManager.<init>(PartitionManager.java:83) ~[storm-kafka-0.9.3.jar:0.9.3]
at storm.kafka.ZkCoordinator.refresh(ZkCoordinator.java:98) ~[storm-kafka-0.9.3.jar:0.9.3]
at storm.kafka.ZkCoordinator.getMyManagedPartitions(ZkCoordinator.java:69) ~[storm-kafka-0.9.3.jar:0.9.3]
at storm.kafka.KafkaSpout.nextTuple(KafkaSpout.java:135) ~[storm-kafka-0.9.3.jar:0.9.3]
at backtype.storm.daemon.executor$fn__3373$fn__3388$fn__3417.invoke(executor.clj:565) ~[storm-core-0.9.3.jar:0.9.3]
at backtype.storm.util$async_loop$fn__464.invoke(util.clj:463) ~[storm-core-0.9.3.jar:0.9.3]
at clojure.lang.AFn.run(AFn.java:24) [clojure-1.5.1.jar:na]
at java.lang.Thread.run(Unknown Source) [na:1.7.0_79]
8351 [Thread-8-flux_spout] ERROR backtype.storm.daemon.executor -
java.lang.NoClassDefFoundError: Could not initialize class org.apache.log4j.Log4jLoggerFactory
	(same stack trace as above)
8671 [Thread-8-flux_spout] ERROR backtype.storm.util - Halting process: ("Worker died")
java.lang.RuntimeException: ("Worker died")
at backtype.storm.util$exit_process_BANG_.doInvoke(util.clj:325) [storm-core-0.9.3.jar:0.9.3]
at clojure.lang.RestFn.invoke(RestFn.java:423) [clojure-1.5.1.jar:na]
at backtype.storm.daemon.worker$fn__3808$fn__3809.invoke(worker.clj:452) [storm-core-0.9.3.jar:0.9.3]
at backtype.storm.daemon.executor$mk_executor_data$fn__3274$fn__3275.invoke(executor.clj:240) [storm-core-0.9.3.jar:0.9.3]
at backtype.storm.util$async_loop$fn__464.invoke(util.clj:473) [storm-core-0.9.3.jar:0.9.3]
at clojure.lang.AFn.run(AFn.java:24) [clojure-1.5.1.jar:na]
at java.lang.Thread.run(Unknown Source) [na:1.7.0_79]
Cause analysis: the project imports the kafka, storm, storm-kafka and "other" jar sets, and between them both log4j-over-slf4j and slf4j-log4j12 end up on the classpath. That is the delegation loop SLF4J warns about above, and it prevents log4j from initializing.
Fix: delete the redundant logging jars so only one binding remains (with Storm 0.9.x, which logs through logback, slf4j-log4j12 is typically the one to drop).
After the change, the hand-written spout receives the log lines and emits them to PrintBolt, which prints them to the console.
(2) Do the computation (and the intermediate-data storage question)
0. Analyze which source fields each metric needs
pv:       time
uv:       time, uv_id
vv:       time, ss_id
newip:    time, cip
newcust:  time, uv_id
br:       time, ss_id, urlname
avgtime:  time, ss_id, ss_time
viewdeep: time, ss_id, urlname
So the required source fields are: time, uv_id, ss_id, ss_time, urlname, cip
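To make the index arithmetic in ClearBolt below easier to follow, here is a hypothetical log line in the assumed tracking format: fields are '|'-separated, field 1 is the urlname, field 13 the uv_id, field 14 packs ssId_ssCount_ssTime, and field 15 the client ip (the f* placeholders stand in for fields this section does not use):
f0|a.jsp|f2|f3|f4|f5|f6|f7|f8|f9|f10|f11|f12|86683590224847|9732736_3_1514736000000|192.168.239.1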
1. Write a ClearBolt that extracts the required fields from the raw log
package com.liming.flux;

import java.util.Date;
import java.util.Map;

import com.liming.flux.utils.FluxUtils;

import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;

public class ClearBolt extends BaseRichBolt {
    private OutputCollector collector;

    @Override
    public void prepare(Map map, TopologyContext topology, OutputCollector collector) {
        this.collector = collector;
    }

    @Override
    public void execute(Tuple input) {
        // the raw log line is '|'-separated
        String string = input.getStringByField("str");
        String[] values = string.split("\\|");
        String uv_id = values[13];
        // field 14 has the form ssId_ssCount_ssTime
        String ss_id = values[14].split("_")[0];
        String ss_time = values[14].split("_")[2];
        String urlname = values[1];
        String cip = values[15];
        // ss_time is an epoch timestamp in milliseconds; format it as yyyy-MM-dd
        Date date = new Date(Long.parseLong(ss_time));
        String time = FluxUtils.formatDate(date);
        // emit anchored to the input, then ack it
        collector.emit(input, new Values(time, uv_id, ss_id, ss_time, urlname, cip));
        collector.ack(input);
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("time", "uv_id", "ss_id", "ss_time", "urlname", "cip"));
    }
}
package com.liming.flux;

import java.util.Iterator;
import java.util.Map;

import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;

public class PrintBolt extends BaseRichBolt {
    private OutputCollector collector = null;

    @Override
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
    }

    @Override
    public void execute(Tuple input) {
        // print every field of the incoming tuple as key:value
        Fields fields = input.getFields();
        Iterator<String> iterator = fields.iterator();
        while (iterator.hasNext()) {
            String key = iterator.next();
            String value = input.getStringByField(key);
            System.out.println(key + ":" + value);
        }
        collector.ack(input);
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // terminal bolt: no output fields
    }
}
package com.liming.flux.utils;

import java.text.SimpleDateFormat;
import java.util.Date;

public class FluxUtils {
    // note: SimpleDateFormat is not thread-safe; see the variant below
    private static SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd");

    public static String formatDate(Date date) {
        return format.format(date);
    }
}
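Because a bolt's executors can run in several threads and SimpleDateFormat is not thread-safe, a safer variant of FluxUtils (a sketch, not part of the original project) gives each thread its own formatter:

package com.liming.flux.utils;

import java.text.SimpleDateFormat;
import java.util.Date;

public class FluxUtils {
    // one SimpleDateFormat per thread, since the class is not thread-safe
    private static final ThreadLocal<SimpleDateFormat> FORMAT =
            new ThreadLocal<SimpleDateFormat>() {
                @Override
                protected SimpleDateFormat initialValue() {
                    return new SimpleDateFormat("yyyy-MM-dd");
                }
            };

    public static String formatDate(Date date) {
        return FORMAT.get().format(date);
    }
}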
package com.liming.flux;

import java.util.UUID;

import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.generated.StormTopology;
import backtype.storm.spout.SchemeAsMultiScheme;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.utils.Utils;
import storm.kafka.BrokerHosts;
import storm.kafka.KafkaSpout;
import storm.kafka.SpoutConfig;
import storm.kafka.StringScheme;
import storm.kafka.ZkHosts;

public class FluxTopology {
    public static void main(String[] args) {
        // spout id, must be unique within the topology
        String KAFKA_SPOUT_ID = "flux_spout";
        // the kafka topic to consume
        String CONSUME_TOPIC = "flux_topic";
        // the zookeeper address to connect to
        String ZK_HOSTS = "192.168.239.129:2181";
        // connection parameters for the KafkaSpout
        BrokerHosts hosts = new ZkHosts(ZK_HOSTS);
        SpoutConfig spoutConfig = new SpoutConfig(hosts, CONSUME_TOPIC, "/" + CONSUME_TOPIC, UUID.randomUUID().toString());
        spoutConfig.scheme = new SchemeAsMultiScheme(new StringScheme());
        // the spout reads raw log lines from kafka and emits them
        KafkaSpout kafkaSpout = new KafkaSpout(spoutConfig);
        // build the topology: KafkaSpout -> ClearBolt -> PrintBolt
        TopologyBuilder builder = new TopologyBuilder();
        builder.setSpout(KAFKA_SPOUT_ID, kafkaSpout);
        // clean the data
        builder.setBolt("ClearBolt", new ClearBolt()).shuffleGrouping(KAFKA_SPOUT_ID);
        builder.setBolt("PrintBolt", new PrintBolt()).shuffleGrouping("ClearBolt");
        StormTopology topology = builder.createTopology();
        // submit the topology to a local in-process cluster
        Config conf = new Config();
        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology("MyTopology", conf, topology);
        // let it run for 1000 seconds, then kill the topology and shut the cluster down
        Utils.sleep(1000 * 1000);
        cluster.killTopology("MyTopology");
        cluster.shutdown();
    }
}
The required source fields are now extracted successfully.
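As a preview of the computation step itself, below is a minimal sketch of a pv counter (the class name PvBolt and the in-memory map are illustrative assumptions, not the final design): every cleaned tuple counts as one page view, keyed by day. Keeping the counts only in a HashMap is exactly the intermediate-storage problem raised above, since the state is lost whenever a worker restarts.

package com.liming.flux;

import java.util.HashMap;
import java.util.Map;

import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Tuple;

public class PvBolt extends BaseRichBolt {
    private OutputCollector collector;
    // pv count per day ("yyyy-MM-dd" -> count); in-memory only, lost on restart
    private Map<String, Long> pvByDay = new HashMap<String, Long>();

    @Override
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
    }

    @Override
    public void execute(Tuple input) {
        // every cleaned tuple is one page view
        String time = input.getStringByField("time");
        Long pv = pvByDay.get(time);
        pvByDay.put(time, pv == null ? 1L : pv + 1);
        System.out.println("pv[" + time + "] = " + pvByDay.get(time));
        collector.ack(input);
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // terminal in this sketch; no output fields
    }
}

It would be wired in with builder.setBolt("PvBolt", new PvBolt()).shuffleGrouping("ClearBolt"); with the default parallelism of 1 the single in-memory map sees every tuple.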