LogParserBolt类
package com.ibeifeng.bigdata.storm.weglog;

import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.IRichBolt;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;

import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static com.ibeifeng.bigdata.storm.weglog.WebLogConstants.*;

/**
 * Web-log parsing bolt.
 *
 * <p>Receives one raw access-log line per tuple (field {@code "str"}),
 * extracts ip / server time / request url / http referer / user agent with a
 * regex, derives day ({@code yyyyMMdd}), hour ({@code yyyyMMddHH}) and minute
 * ({@code yyyyMMddHHmm}) buckets from the timestamp, and fans the record out
 * to four dedicated streams for downstream KPI counting.
 *
 * Created by ad on 2016/12/17.
 */
public class LogParserBolt implements IRichBolt {

    private Pattern pattern;
    private OutputCollector collector;
    // SimpleDateFormat is not thread-safe, but a bolt executor invokes
    // execute() single-threaded, so one per-instance formatter is safe and
    // avoids re-allocating it for every tuple.
    private DateFormat dateFormat;

    @Override
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        // Capture groups: 1=ip, 2=server timestamp, 3=request url,
        // 4=http referer, 5=user agent.
        pattern = Pattern.compile("([^ ]*) [^ ]* [^ ]* \\[([\\d+]*)\\] \\\"[^ ]* ([^ ]*) [^ ]*\\\" \\d{3} \\d+ \\\"([^\"]*)\\\" \\\"([^\"]*)\\\" \\\"[^ ]*\\\"");
        this.dateFormat = new SimpleDateFormat("yyyyMMddHHmm");
        this.collector = collector;
    }

    @Override
    public void execute(Tuple input) {
        String webLog = input.getStringByField("str");
        // BUG FIX: the original guard used "||", which let a null line through
        // (!"".equals(null) is true) and NPE'd in pattern.matcher(null).
        // A null/empty line must skip parsing entirely; it is still acked.
        if (webLog != null && !"".equals(webLog)) {
            Matcher matcher = pattern.matcher(webLog);
            if (matcher.find()) {
                String ip = matcher.group(1);
                String serverTimeStr = matcher.group(2);

                // The bracketed field carries the server time as epoch millis.
                long timestamp = Long.parseLong(serverTimeStr);
                String dateStr = dateFormat.format(new Date(timestamp)); // yyyyMMddHHmm
                String day = dateStr.substring(0, 8);   // yyyyMMdd
                String hour = dateStr.substring(0, 10); // yyyyMMddHH
                String minute = dateStr;                // yyyyMMddHHmm

                String requestUrl = matcher.group(3);
                String httpRefer = matcher.group(4);
                String userAgent = matcher.group(5);

                // Fan out to the per-KPI streams, anchored to the input tuple
                // so downstream failures replay the original message.
                this.collector.emit(IP_COUNT_STREAM, input, new Values(day, hour, minute, ip));
                this.collector.emit(URL_PARSER_STREAM, input, new Values(day, hour, minute, requestUrl));
                this.collector.emit(HTTPREFER_PARSER_STREAM, input, new Values(day, hour, minute, httpRefer));
                this.collector.emit(USERAGENT_PARSER_STREAM, input, new Values(day, hour, minute, userAgent));
            }
        }
        this.collector.ack(input);
    }

    @Override
    public void cleanup() {
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declareStream(IP_COUNT_STREAM, new Fields(DAY, HOUR, MINUTE, IP));
        declarer.declareStream(URL_PARSER_STREAM, new Fields(DAY, HOUR, MINUTE, REQUEST_URL));
        declarer.declareStream(HTTPREFER_PARSER_STREAM, new Fields(DAY, HOUR, MINUTE, HTTP_REFER));
        declarer.declareStream(USERAGENT_PARSER_STREAM, new Fields(DAY, HOUR, MINUTE, USERAGENT));
    }

    @Override
    public Map<String, Object> getComponentConfiguration() {
        return null;
    }
}
UserAgentParserBolt类
package com.ibeifeng.bigdata.storm.weglog;

import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.IRichBolt;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;
import com.ibeifeng.bigdata.storm.util.UserAgentUtil;

import static com.ibeifeng.bigdata.storm.weglog.WebLogConstants.*;

import java.util.Map;

/**
 * User-agent parsing bolt.
 *
 * <p>Breaks the raw user-agent string into browser and operating-system
 * information via {@link UserAgentUtil} and emits counting keys on the
 * browser and OS streams. For each dimension it emits the bare name, and
 * additionally a {@code name_version} key when a version is available.
 *
 * Created by ad on 2016/12/18.
 */
public class UserAgentParserBolt implements IRichBolt {

    private OutputCollector collector;

    @Override
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
    }

    @Override
    public void execute(Tuple input) {
        String day = input.getStringByField(DAY);
        String hour = input.getStringByField(HOUR);
        String minute = input.getStringByField(MINUTE);
        String userAgent = input.getStringByField(USERAGENT);

        if (userAgent != null && userAgent.length() > 0) {
            UserAgentUtil.UserAgentInfo info = UserAgentUtil.analyticUserAgent(userAgent);
            if (info != null) {
                emitBrowser(input, day, hour, minute, info);
                emitOs(input, day, hour, minute, info);
            }
        }
        this.collector.ack(input);
    }

    /** Emits browser name (and name_version when a version is known). */
    private void emitBrowser(Tuple input, String day, String hour, String minute,
                             UserAgentUtil.UserAgentInfo info) {
        String name = info.getBrowserName();
        if (name == null || "".equals(name)) {
            return;
        }
        // Browser family on its own.
        collector.emit(BROWSER_COUNT_STREAM, input, new Values(day, hour, minute, name));
        String version = info.getBrowserVersion();
        if (version != null && !"".equals(version)) {
            collector.emit(BROWSER_COUNT_STREAM, input,
                    new Values(day, hour, minute, name + "_" + version));
        }
    }

    /** Emits OS name (and name_version when a version is known). */
    private void emitOs(Tuple input, String day, String hour, String minute,
                        UserAgentUtil.UserAgentInfo info) {
        String name = info.getOsName();
        if (name == null || "".equals(name)) {
            return;
        }
        collector.emit(OS_COUNT_STREAM, input, new Values(day, hour, minute, name));
        String version = info.getOsVersion();
        if (version != null && !"".equals(version)) {
            collector.emit(OS_COUNT_STREAM, input,
                    new Values(day, hour, minute, name + "_" + version));
        }
    }

    @Override
    public void cleanup() {
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declareStream(BROWSER_COUNT_STREAM, new Fields(DAY, HOUR, MINUTE, BROWSER));
        declarer.declareStream(OS_COUNT_STREAM, new Fields(DAY, HOUR, MINUTE, OS));
    }

    @Override
    public Map<String, Object> getComponentConfiguration() {
        return null;
    }
}
CountKpiBolt类
package com.ibeifeng.bigdata.storm.weglog;

import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.IRichBolt;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import static com.ibeifeng.bigdata.storm.weglog.WebLogConstants.*;

/**
 * Generic KPI counting bolt.
 *
 * <p>Counts a KPI value (4th tuple field) per day, per hour and per minute in
 * an in-memory map and emits one ({@code <kpiType>_<bucket>_<kpi>}, count)
 * pair for each of the three buckets. When the day changes, all entries of
 * the previous day are evicted (hour/minute keys share the day prefix, so a
 * single prefix scan clears everything).
 *
 * Created by ad on 2016/12/17.
 */
public class CountKpiBolt implements IRichBolt {

    // Prefix distinguishing which KPI this bolt instance counts (I/U/B/O).
    private String kpiType;
    // TODO optimisation: replace with an in-memory store such as redis so
    // counts survive worker restarts.
    private Map<String, Integer> kpiCounts;
    private String currentDay = "";
    private OutputCollector _collector;

    public CountKpiBolt(String kpiType) {
        this.kpiType = kpiType;
    }

    @Override
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        this.kpiCounts = new HashMap<>();
        this._collector = collector;
    }

    @Override
    public void execute(Tuple input) {
        String day = input.getStringByField("day");
        String hour = input.getStringByField("hour");
        String minute = input.getStringByField("minute");
        // 4th field is the KPI value (ip, url, browser or os key).
        String kpi = input.getString(3);

        String kpiByDay = day + "_" + kpi;
        String kpiByHour = hour + "_" + kpi;
        String kpiByMinute = minute + "_" + kpi;

        // Day rollover: drop every counter of the previous day to bound memory.
        if (!currentDay.equals(day)) {
            evictDay(currentDay);
        }
        currentDay = day;

        // IDIOM FIX: single map lookup per key instead of containsKey+get.
        int kpiCountByDay = increment(kpiByDay);
        int kpiCountByHour = increment(kpiByHour);
        int kpiCountByMinute = increment(kpiByMinute);

        this._collector.emit(input, new Values(kpiType + "_" + kpiByDay, kpiCountByDay));
        this._collector.emit(input, new Values(kpiType + "_" + kpiByHour, kpiCountByHour));
        this._collector.emit(input, new Values(kpiType + "_" + kpiByMinute, kpiCountByMinute));
        this._collector.ack(input);
    }

    /** Bumps the counter stored under {@code key} and returns the new value. */
    private int increment(String key) {
        Integer previous = kpiCounts.get(key);
        int next = (previous == null ? 0 : previous) + 1;
        kpiCounts.put(key, next);
        return next;
    }

    /** Removes all counters whose key starts with the given day prefix. */
    private void evictDay(String day) {
        Iterator<Map.Entry<String, Integer>> iter = kpiCounts.entrySet().iterator();
        while (iter.hasNext()) {
            Map.Entry<String, Integer> entry = iter.next();
            if (entry.getKey().startsWith(day)) {
                iter.remove();
            }
        }
    }

    @Override
    public void cleanup() {
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields(SERVERTIME_KPI, KPI_COUNTS));
    }

    @Override
    public Map<String, Object> getComponentConfiguration() {
        return null;
    }
}
SaveBolt类
package com.ibeifeng.bigdata.storm.weglog;

import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.IRichBolt;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.tuple.Tuple;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;

import java.io.IOException;
import java.util.Map;

import static com.ibeifeng.bigdata.storm.weglog.WebLogConstants.*;

/**
 * Persistence bolt: writes each (serverTimeAndKpi, count) pair into HBase.
 * The row key is the full serverTimeAndKpi string; the column qualifier is
 * its KPI-type prefix (text before the first '_').
 *
 * Created by ad on 2016/12/17.
 */
public class SaveBolt implements IRichBolt {

    private HTable table;
    private OutputCollector collector;

    @Override
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        Configuration configuration = HBaseConfiguration.create();
        try {
            table = new HTable(configuration, HBASE_TABLENAME);
        } catch (IOException e) {
            // A bolt without its table is useless: log, then fail startup.
            e.printStackTrace();
            throw new RuntimeException(e);
        }
        this.collector = collector;
    }

    @Override
    public void execute(Tuple input) {
        String serverTimeAndKpi = input.getStringByField(SERVERTIME_KPI);
        Integer kpiCounts = input.getIntegerByField(KPI_COUNTS);
        System.err.println("serverTimeAndKpi=" + serverTimeAndKpi + ", kpiCounts=" + kpiCounts);
        if (serverTimeAndKpi != null && kpiCounts != null) {
            persist(serverTimeAndKpi, kpiCounts);
        }
        this.collector.ack(input);
    }

    /** Writes one counter cell; the count is stored as its string form. */
    private void persist(String rowKey, Integer count) {
        Put put = new Put(Bytes.toBytes(rowKey));
        String columnQualifier = rowKey.split("_")[0];
        put.add(Bytes.toBytes(COLUMN_FAMILY), Bytes.toBytes(columnQualifier),
                Bytes.toBytes("" + count));
        try {
            table.put(put);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    public void cleanup() {
        if (table != null) {
            try {
                table.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
    }

    @Override
    public Map<String, Object> getComponentConfiguration() {
        return null;
    }
}
常量类WebLogConstants
package com.ibeifeng.bigdata.storm.weglog;

/**
 * Shared constants for the web-log analysis topology: component ids, stream
 * ids, tuple field names, KPI type prefixes and HBase table coordinates.
 *
 * Created by ad on 2016/12/17.
 */
// IDIOM FIX: a pure constants holder should be final and non-instantiable.
public final class WebLogConstants {

    private WebLogConstants() {
        // Prevent instantiation.
    }

    // Component (spout/bolt) ids.
    public static final String KAFKA_SPOUT_ID = "kafkaSpoutId";
    public static final String WEB_LOG_PARSER_BOLT = "webLogParserBolt";
    public static final String COUNT_IP_BOLT = "countIpBolt";
    public static final String COUNT_BROWSER_BOLT = "countBrowserBolt";
    public static final String COUNT_OS_BOLT = "countOsBolt";
    public static final String USER_AGENT_PARSER_BOLT = "userAgentParserBolt";
    public static final String SAVE_BOLT = "saveBolt";

    // Stream ids.
    public static final String IP_COUNT_STREAM = "ipCountStream";
    public static final String URL_PARSER_STREAM = "urlParserStream";
    public static final String HTTPREFER_PARSER_STREAM = "httpReferParserStream";
    public static final String USERAGENT_PARSER_STREAM = "userAgentParserStream";
    public static final String BROWSER_COUNT_STREAM = "browserCountStream";
    public static final String OS_COUNT_STREAM = "osCountStream";

    // Tuple field names.
    public static final String DAY = "day";
    public static final String HOUR = "hour";
    public static final String MINUTE = "minute";
    public static final String IP = "ip";
    public static final String REQUEST_URL = "requestUrl";
    public static final String HTTP_REFER = "httpRefer";
    public static final String USERAGENT = "userAgent";
    public static final String BROWSER = "browser";
    public static final String OS = "os";
    public static final String SERVERTIME_KPI = "serverTimeAndKpi";
    public static final String KPI_COUNTS = "kpiCounts";

    // KPI type prefixes (used as the first segment of emitted keys).
    public static final String IP_KPI = "I";
    public static final String URL_KPI = "U";
    public static final String BROWSER_KPI = "B";
    public static final String OS_KPI = "O";

    // HBase table coordinates. NOTE(review): the table name looks misspelled
    // ("statictis"), but it must match the actual HBase table — do not rename
    // without migrating the table.
    public static final String HBASE_TABLENAME = "weblogstatictis";
    public static final String COLUMN_FAMILY = "info";
}
测试类WebLogStatictis
package com.ibeifeng.bigdata.storm.weglog;

import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.StormSubmitter;
import backtype.storm.generated.AlreadyAliveException;
import backtype.storm.generated.InvalidTopologyException;
import backtype.storm.generated.StormTopology;
import backtype.storm.spout.SchemeAsMultiScheme;
import backtype.storm.topology.IRichSpout;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.tuple.Fields;
import storm.kafka.*;

import static com.ibeifeng.bigdata.storm.weglog.WebLogConstants.*;

import java.util.UUID;

/**
 * Topology driver: wires the Kafka spout through the parsing and counting
 * bolts into the HBase save bolt, and submits the topology either to a local
 * in-process cluster (no args) or to a real cluster (args[0] = topology name).
 *
 * Created by ad on 2016/12/17.
 */
public class WebLogStatictis {

    public static void main(String[] args) {
        WebLogStatictis webLogStatictis = new WebLogStatictis();
        StormTopology topology = webLogStatictis.buildTopology();
        Config conf = new Config();
        if (args == null || args.length == 0) {
            // No arguments: run inside an in-process local cluster for testing.
            LocalCluster localCluster = new LocalCluster();
            localCluster.submitTopology("webloganalyse", conf, topology);
        } else {
            // Cluster mode: number of worker processes for this topology.
            conf.setNumWorkers(4);
            try {
                StormSubmitter.submitTopology(args[0], conf, topology);
            } catch (AlreadyAliveException | InvalidTopologyException e) {
                // IDIOM FIX: multi-catch replaces two identical catch blocks.
                e.printStackTrace();
            }
        }
    }

    /**
     * Builds the Kafka spout reading topic "nginxlog" from ZooKeeper at
     * bigdata01.com:2181, decoding each message as a plain string.
     *
     * NOTE(review): the consumer id is a fresh random UUID per run, so the
     * offsets stored in ZK are never reused across restarts; combined with
     * forceFromStart the spout always replays the topic from the beginning.
     * Confirm this is intended before deploying to production.
     */
    private IRichSpout generateSpout() {
        BrokerHosts hosts = new ZkHosts("bigdata01.com:2181");
        String topic = "nginxlog";
        String zkRoot = "/" + topic;
        String id = UUID.randomUUID().toString();
        SpoutConfig spoutConf = new SpoutConfig(hosts, topic, zkRoot, id);
        spoutConf.scheme = new SchemeAsMultiScheme(new StringScheme()); // decode as string
        spoutConf.forceFromStart = true; // consume from the beginning of the topic
        return new KafkaSpout(spoutConf);
    }

    /** Wires spout and bolts together; groupings match each bolt's key field. */
    private StormTopology buildTopology() {
        TopologyBuilder builder = new TopologyBuilder();

        builder.setSpout(KAFKA_SPOUT_ID, generateSpout());
        builder.setBolt(WEB_LOG_PARSER_BOLT, new LogParserBolt())
                .shuffleGrouping(KAFKA_SPOUT_ID);
        // Fields grouping on IP keeps all counts for one ip on one task.
        builder.setBolt(COUNT_IP_BOLT, new CountKpiBolt(IP_KPI))
                .fieldsGrouping(WEB_LOG_PARSER_BOLT, IP_COUNT_STREAM, new Fields(IP));
        builder.setBolt(USER_AGENT_PARSER_BOLT, new UserAgentParserBolt())
                .shuffleGrouping(WEB_LOG_PARSER_BOLT, USERAGENT_PARSER_STREAM);
        builder.setBolt(COUNT_BROWSER_BOLT, new CountKpiBolt(BROWSER_KPI))
                .fieldsGrouping(USER_AGENT_PARSER_BOLT, BROWSER_COUNT_STREAM, new Fields(BROWSER));
        builder.setBolt(COUNT_OS_BOLT, new CountKpiBolt(OS_KPI))
                .fieldsGrouping(USER_AGENT_PARSER_BOLT, OS_COUNT_STREAM, new Fields(OS));
        // Three save-bolt tasks draining all three counter bolts.
        builder.setBolt(SAVE_BOLT, new SaveBolt(), 3)
                .shuffleGrouping(COUNT_IP_BOLT)
                .shuffleGrouping(COUNT_BROWSER_BOLT)
                .shuffleGrouping(COUNT_OS_BOLT);

        return builder.createTopology();
    }
}