1. Environment
apache-storm-1.1.0
Hadoop 2.8.0
Dependencies used:
<dependencies>
    <dependency>
        <groupId>org.apache.storm</groupId>
        <artifactId>storm-core</artifactId>
        <version>1.1.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.storm</groupId>
        <artifactId>storm-hdfs</artifactId>
        <version>1.1.0</version>
    </dependency>
</dependencies>
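One packaging note: when the topology is later submitted to a real cluster with storm jar, storm-core is usually given <scope>provided</scope> so it is not bundled into the fat jar (the workers already have it on their classpath); for the LocalCluster run shown below, the default compile scope is what you want.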
Requirement: read MapReduce log files stored on HDFS and count the number of lines at each log level (INFO, WARN, DEBUG, ERROR).
The idea is simple: a Spout reads the files from HDFS, then the Bolts do a word count on the log level, which is first extracted from each line with a regular expression (a quick check of the pattern follows).
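To sanity-check the regular expression before wiring it into the topology, it can be run against a sample line. A minimal sketch (the sample log line is made up for illustration; the leading .{23} assumes a log4j-style timestamp prefix such as 2017-05-27 10:15:30,123):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RegexCheck {
    public static void main(String[] args) {
        // ".{23}" skips past the timestamp prefix; find() makes the exact offset forgiving
        Pattern p = Pattern.compile(".{23}(INFO|DEBUG|WARN|ERROR)");
        // Hypothetical MapReduce log line for illustration
        String sample = "2017-05-27 10:15:30,123 INFO [main] org.apache.hadoop.mapreduce.Job: Running job";
        Matcher m = p.matcher(sample);
        if (m.find()) {
            System.out.println(m.group(1)); // prints INFO
        }
    }
}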
2. Implementation
LogLevelCountTopology
The topology class, which wires the spout and bolts together:
import neu.bolt.CountBolt;
import neu.bolt.ExtractBolt;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.generated.AlreadyAliveException;
import org.apache.storm.generated.AuthorizationException;
import org.apache.storm.generated.InvalidTopologyException;
import org.apache.storm.hdfs.spout.HdfsSpout;
import org.apache.storm.hdfs.spout.TextFileReader;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.tuple.Fields;

import java.util.HashMap;

public class LogLevelCountTopology {
    public static void main(String[] args) throws InvalidTopologyException, AuthorizationException, AlreadyAliveException, InterruptedException {
        System.setProperty("HADOOP_USER_NAME", "root");
        if (args.length != 4) {
            System.out.println("Usage: <HdfsUri> <SourceDir> <ArchiveDir> <BadFilesDir>");
            System.exit(1);
        }
        TopologyBuilder builder = new TopologyBuilder();
        // HdfsSpout monitors SourceDir, emits each file line by line (field "line"),
        // moves fully-read files to ArchiveDir and unreadable ones to BadFilesDir
        HdfsSpout hdfsSpout = new HdfsSpout()
                .setReaderType("text")
                .withOutputFields(TextFileReader.defaultFields)
                .setHdfsUri(args[0])
                .setSourceDir(args[1])
                .setArchiveDir(args[2])
                .setBadFilesDir(args[3]);
        HashMap<String, Object> hashMap = new HashMap<>();
        // Regular expression: skip the 23-character timestamp prefix, capture the level
        hashMap.put(ExtractBolt.REGEX, ".{23}(INFO|DEBUG|WARN|ERROR)");
        hashMap.put(ExtractBolt.FIELD, "line");
        builder.setSpout("hdfsSpout", hdfsSpout, 1);
        builder.setBolt("extractBolt", new ExtractBolt(), 1)
                .addConfigurations(hashMap).shuffleGrouping("hdfsSpout");
        // fieldsGrouping on "level" sends all tuples of one level to the same task
        builder.setBolt("countBolt", new CountBolt(), 1)
                .fieldsGrouping("extractBolt", new Fields("level"));
        Config conf = new Config();
        conf.setDebug(true);
        conf.setMaxTaskParallelism(1);
        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology("hdfsLogLevelCountTopology", conf, builder.createTopology());
        // Let the local cluster run for 90 seconds, then shut it down
        Thread.sleep(90000);
        cluster.shutdown();
    }
}
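The code above runs everything in a LocalCluster and kills it after 90 seconds, which is only suitable for testing. To run on a real cluster, the last few lines would be replaced with a StormSubmitter call; a minimal sketch (the worker count is an arbitrary choice here, and the checked exceptions it throws are already declared on main):

// import org.apache.storm.StormSubmitter;
Config conf = new Config();
conf.setNumWorkers(2); // assumption: adjust to your cluster
StormSubmitter.submitTopology("hdfsLogLevelCountTopology", conf, builder.createTopology());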
ExtractBolt
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.IRichBolt;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;

import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class ExtractBolt implements IRichBolt {
    // Keys used to pass the regex and the input field name via component configuration
    public static final String REGEX = "regex";
    public static final String FIELD = "field";
    String field;
    Pattern regex;
    OutputCollector collector;

    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        String regexString = (String) stormConf.get(REGEX);
        this.collector = collector;
        this.field = (String) stormConf.get(FIELD);
        this.regex = Pattern.compile(regexString);
    }

    public void execute(Tuple input) {
        String log = input.getStringByField(field);
        if (log != null) {
            Matcher matcher = regex.matcher(log);
            if (matcher.find()) {
                // Group 1 is the captured log level (INFO|DEBUG|WARN|ERROR)
                String level = matcher.group(1);
                collector.emit(new Values(level));
            } else {
                System.err.println("Line contains no INFO|DEBUG|WARN|ERROR level: " + log);
            }
        }
        collector.ack(input);
    }

    public void cleanup() {
    }

    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("level"));
    }

    public Map<String, Object> getComponentConfiguration() {
        return null;
    }
}
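One thing worth noting in ExtractBolt: collector.emit(new Values(level)) emits an unanchored tuple, so if it later fails in CountBolt, the HdfsSpout will not replay the source line. If at-least-once processing is wanted, the emit can be anchored to the input tuple:

// Anchored variant: links the emitted tuple to the input so a downstream
// failure causes the spout to replay the source line
collector.emit(input, new Values(level));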
CountBolt
import org.apache.storm.topology.BasicOutputCollector;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseBasicBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;

import java.util.HashMap;
import java.util.Map;

public class CountBolt extends BaseBasicBolt {
    // Running total per log level, held in this task's memory
    private Map<String, Integer> counts = new HashMap<>();

    public void execute(Tuple input, BasicOutputCollector collector) {
        String level = input.getStringByField("level");
        Integer count = counts.get(level);
        if (count == null)
            count = 0;
        count++;
        counts.put(level, count);
        System.out.println(level + " : " + count);
        collector.emit(new Values(level, count));
    }

    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("level", "count"));
    }
}
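Since counts lives in each task's memory, the totals are lost once the topology stops. For a local test it can be convenient to print the final totals on shutdown by overriding cleanup(), which Storm invokes reliably only in local mode; a small sketch of what could be added to CountBolt:

// Dump the accumulated totals when the bolt shuts down (reliable in local mode only)
@Override
public void cleanup() {
    System.out.println("Final counts: " + counts);
}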
Example program arguments: hdfs://172.17.11.85:9000 /log /ArchiveDir /BadFilesDir (the HDFS URI, the directory the spout reads logs from, the directory finished files are archived to, and the directory for unreadable files).
A screenshot of the IDE console output: