1. 温故而知新：使用词频统计实现 TopN。以下是使用到的 Maven 依赖：
<dependencies>
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>storm-core</artifactId>
<version>1.1.0</version>
</dependency>
<!-- Trident HBase state integration (HBaseStateFactory / HBaseUpdater below). -->
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>storm-hbase</artifactId>
<version>1.1.0</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.3</version>
<exclusions>
<!-- Excluded to prevent duplicate slf4j bindings on the classpath. -->
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
<version>3.4.6</version>
<exclusions>
<!-- Same slf4j exclusions as hadoop-client, for the same reason. -->
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies>
2. 代码实现（注：下方代码已补全原文缺失的分号等语法细节）：
import java.util.*;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.StormSubmitter;
import org.apache.storm.generated.AlreadyAliveException;
import org.apache.storm.generated.AuthorizationException;
import org.apache.storm.generated.InvalidTopologyException;
import org.apache.storm.hbase.trident.mapper.SimpleTridentHBaseMapper;
import org.apache.storm.hbase.trident.mapper.TridentHBaseMapper;
import org.apache.storm.hbase.trident.state.HBaseState;
import org.apache.storm.hbase.trident.state.HBaseStateFactory;
import org.apache.storm.hbase.trident.state.HBaseUpdater;
import org.apache.storm.topology.base.BaseWindowedBolt;
import org.apache.storm.trident.TridentTopology;
import org.apache.storm.trident.operation.BaseAggregator;
import org.apache.storm.trident.operation.BaseFunction;
import org.apache.storm.trident.operation.FlatMapFunction;
import org.apache.storm.trident.operation.TridentCollector;
import org.apache.storm.trident.state.StateFactory;
import org.apache.storm.trident.testing.FixedBatchSpout;
import org.apache.storm.trident.tuple.TridentTuple;
import org.apache.storm.trident.windowing.config.SlidingDurationWindow;
import org.apache.storm.trident.windowing.config.WindowConfig;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;
public class TopNTopology {
private static class TopNFunction extends BaseFunction {
private int TOPN
public TopNFunction(int n) {
this.TOPN = n
}
@Override
public void execute(TridentTuple tuple, TridentCollector collector) {
HashMap<String, Long> hashMap = (HashMap<String, Long>) tuple.get(0)
List<Map.Entry<String, Long>> list = new ArrayList<>(hashMap.entrySet())
Collections.sort(list, new Comparator<Map.Entry<String, Long>>() {
@Override
public int compare(Map.Entry<String, Long> o1, Map.Entry<String, Long> o2) {
return o1.getValue().compareTo(o2.getValue())
}
})
int i = 1
for (int j = list.size() - 1
if (i > TOPN)
break
collector.emit(new Values(String.valueOf(i), list.get(j).getKey(), String.valueOf(list.get(j).getValue())))
System.out.println("Sending: " + i + " " + list.get(j).getKey() + ": " + list.get(j).getValue())
i++
}
System.out.println("----------------done----------------------")
}
}
private static class SplitFunction implements FlatMapFunction {
@Override
public Iterable<Values> execute(TridentTuple input) {
ArrayList<Values> values = new ArrayList<>()
String sentence = input.getStringByField("sentence")
String[] split = sentence.split(" ")
for (String s : split) {
values.add(new Values(s))
}
return values
}
}
private static class WordAggregator extends BaseAggregator<HashMap<String, Long>> {
@Override
public HashMap<String, Long> init(Object batchId, TridentCollector collector) {
return new HashMap<>()
}
@Override
public void aggregate(HashMap<String, Long> val, TridentTuple tuple, TridentCollector collector) {
String word = tuple.getStringByField("word")
long count = 1
if (val.containsKey(word))
count += val.get(word)
val.put(word, count)
}
@Override
public void complete(HashMap<String, Long> val, TridentCollector collector) {
collector.emit(new Values(val))
}
}
public static void main(String[] args) throws InvalidTopologyException, AuthorizationException, AlreadyAliveException {
FixedBatchSpout spout = new FixedBatchSpout(new Fields("sentence"), 3,
new Values("the cow jumped over the moon"),
new Values("the man went to the store and bought some candy"),
new Values("four score and seven years ago"),
new Values("how many apples can you eat"),
new Values("to be or not to be the person"))
spout.setCycle(true)
TridentHBaseMapper tridentHBaseMapper = new SimpleTridentHBaseMapper()
.withColumnFamily("result")
.withColumnFields(new Fields("word", "count"))
.withRowKeyField("rank")
HBaseState.Options options = new HBaseState.Options()
.withConfigKey("hbase")
.withDurability(Durability.SYNC_WAL)
.withMapper(tridentHBaseMapper)
.withTableName("Top5Count")
StateFactory hBaseStateFactory = new HBaseStateFactory(options)
WindowConfig durationWindow = SlidingDurationWindow.of(BaseWindowedBolt.Duration.seconds(10), BaseWindowedBolt.Duration.seconds(5))
TridentTopology topology = new TridentTopology()
topology.newStream("fixedSpout", spout)
.flatMap(new SplitFunction(), new Fields("word"))
.window(durationWindow, new Fields("word"), new WordAggregator(), new Fields("wordcount"))
.each(new Fields("wordcount"), new TopNFunction(5), new Fields("rank", "word", "count"))
.partitionPersist(hBaseStateFactory, new Fields("rank", "word", "count"), new HBaseUpdater(), new Fields())
Config conf = new Config()
conf.put("hbase", new HashMap<String, Object>())
if (args.length == 0) {
LocalCluster cluster = new LocalCluster()
cluster.submitTopology("Top5Topology", conf, topology.build())
} else {
conf.setNumWorkers(3)
StormSubmitter.submitTopologyWithProgressBar(args[0], conf, topology.build())
}
}
}