1.shuffle :随机分组
2.field分组
按照指定field的key进行hash处理,
相同的field,一定进入到同一bolt.
该分组容易产生数据倾斜问题,通过使用二次聚合避免此类问题。
3.使用二次聚合避免倾斜。
App类:
package com.mao.storm.group.shuffle;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.tuple.Fields;
/**
 * Entry point that builds the word-count topology and runs it on an in-process
 * {@link LocalCluster} for 20 seconds.
 *
 * <p>Pipeline: {@code wcSpout} -> {@code split-bolt} (shuffle grouping) ->
 * {@code count-Bolt1} (shuffle grouping, first aggregation stage) ->
 * {@code count-Bolt2} (fields grouping on "word", second aggregation stage).
 */
public class App {
    /**
     * Wires the topology, submits it to a local cluster, waits, then shuts the cluster down.
     *
     * @param args unused
     * @throws Exception if the cluster cannot be created, the topology cannot be submitted,
     *                   or the wait is interrupted
     */
    public static void main(String[] args) throws Exception {
        TopologyBuilder builder = new TopologyBuilder();
        // Source spout.
        builder.setSpout("wcSpout", new WordCountSpout()).setNumTasks(2);
        // Split bolt: receives spout output via shuffle grouping.
        builder.setBolt("split-bolt", new SplitBolt(), 1).shuffleGrouping("wcSpout").setNumTasks(1);
        // First counting stage: shuffle grouping, so words are not pinned to a task by key.
        // NOTE(review): parallelism is 1 here, so shuffle has a single target task; raise it
        // if the goal is to actually spread a hot key across tasks.
        builder.setBolt("count-Bolt1", new WordCountBolt(), 1).shuffleGrouping("split-bolt").setNumTasks(1);
        // Second counting stage: fields grouping on "word", so all partial counts for one
        // word arrive at the same task.
        builder.setBolt("count-Bolt2", new WordCountBolt(), 3).fieldsGrouping("count-Bolt1", new Fields("word")).setNumTasks(3);

        Config config = new Config();
        config.setNumWorkers(2);
        config.setDebug(true);

        LocalCluster cluster = new LocalCluster();
        try {
            cluster.submitTopology("wcShuffle", config, builder.createTopology());
            Thread.sleep(20000);
        } finally {
            // Always stop the in-process cluster, even if submission fails or the sleep is interrupted.
            cluster.shutdown();
        }
        System.out.println("over");
    }
}
SplitBolt类:
package com.mao.storm.group.shuffle;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.IRichBolt;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import java.util.Map;
/**
 * Bolt that splits each incoming line on single spaces and emits one
 * ("word", "count") tuple per non-empty word, with a count of 1.
 */
public class SplitBolt implements IRichBolt {
    private TopologyContext context;
    private OutputCollector collector;

    /** Keeps the context and collector passed in so that execute can emit and ack. */
    public void prepare(Map map, TopologyContext context, OutputCollector collector) {
        this.context = context;
        this.collector = collector;
    }

    /**
     * Reads the line from field 0, emits (word, 1) for every non-empty token,
     * then acks the input tuple.
     */
    public void execute(Tuple tuple) {
        String line = tuple.getString(0);
        for (String s : line.split(" ")) {
            // Leading or consecutive spaces make split(" ") return empty strings; those are not words.
            if (s.isEmpty()) {
                continue;
            }
            // Anchor the emitted tuple to its input so it is tracked as part of the same tuple tree.
            collector.emit(tuple, new Values(s, 1));
        }
        // An IRichBolt has to ack its input itself; without this the tuple is never marked as processed.
        collector.ack(tuple);
    }

    /** Nothing to release. */
    public void cleanup() {
    }

    /** Declares the two output fields: "word" and "count". */
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        outputFieldsDeclarer.declare(new Fields("word", "count"));
    }

    /** No component-specific configuration. */
    public Map<String, Object> getComponentConfiguration() {
        return null;
    }
}
WordCountBolt类:
package com.mao.storm.group.shuffle;
import com.mao.storm.util.Util;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.IRichBolt;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import java.util.HashMap;
import java.util.Map;
/**
 * Counting bolt meant to be used for both stages of a two-stage aggregation
 * (a first stage of partial counts followed by a second stage that combines them),
 * which is a way to mitigate data skew.
 *
 * <p>Incoming (word, count) tuples are added to a pending map. When more than
 * {@code duration} milliseconds have passed since the last flush, the pending counts are
 * emitted downstream as (word, count) tuples and the pending map is cleared. A flush is
 * only evaluated when a tuple arrives. A separate running total per word is kept and
 * printed in {@link #cleanup()}.
 */
public class WordCountBolt implements IRichBolt {
    private TopologyContext context;
    private OutputCollector collector;
    // Partial counts accumulated since the last flush; cleared on every flush.
    private Map<String, Integer> map1;
    // Running totals of everything this task has received; never cleared, reported in cleanup().
    private Map<String, Integer> totals;
    private long lastEmitTime = 0;
    // Minimum time between two flushes, in milliseconds.
    private long duration = 5000;

    /** Stores the context and collector, creates the count maps and starts the flush window. */
    public void prepare(Map map, TopologyContext context, OutputCollector collector) {
        this.context = context;
        this.collector = collector;
        map1 = new HashMap<String, Integer>();
        totals = new HashMap<String, Integer>();
        // Start the window now; with the initial value 0 the very first tuple would be flushed on its own.
        lastEmitTime = System.currentTimeMillis();
    }

    /**
     * Adds the tuple's count (field 1) to the word (field 0), flushes the pending counts
     * if the flush interval has elapsed, and acks the input tuple.
     */
    public void execute(Tuple tuple) {
        String word = tuple.getString(0);
        Integer count = tuple.getInteger(1);
        Util.sendToLocalhost(this, word);

        add(map1, word, count);
        add(totals, word, count);

        long nowTime = System.currentTimeMillis();
        if (nowTime - lastEmitTime > duration) {
            // Emitted unanchored: each output aggregates many input tuples.
            for (Map.Entry<String, Integer> entry : map1.entrySet()) {
                collector.emit(new Values(entry.getKey(), entry.getValue()));
            }
            map1.clear();
            lastEmitTime = nowTime;
        }
        // An IRichBolt has to ack its input itself; without this the tuple is never marked as processed.
        collector.ack(tuple);
    }

    /**
     * Prints the total count per word received by this task. The totals map is used
     * because the pending map is emptied on every flush and would only show the residue.
     */
    public void cleanup() {
        for (Map.Entry<String, Integer> entry : totals.entrySet()) {
            System.out.println("wordCountNums:" + entry.getKey() + " : " + entry.getValue());
        }
    }

    /** Declares the two output fields: "word" and "count". */
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        outputFieldsDeclarer.declare(new Fields("word", "count"));
    }

    /** No component-specific configuration. */
    public Map<String, Object> getComponentConfiguration() {
        return null;
    }

    /** Adds count to the entry for word, creating the entry if it does not exist yet. */
    private static void add(Map<String, Integer> counts, String word, Integer count) {
        Integer previous = counts.get(word);
        if (previous == null) {
            counts.put(word, count);
        } else {
            counts.put(word, previous + count);
        }
    }
}
WordCountSpout类:(注意:下方贴出的代码实际上是SplitBolt类的重复,并非WordCountSpout的源码)
package com.mao.storm.group.shuffle;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.IRichBolt;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import java.util.Map;
/**
 * Bolt that splits each incoming line on single spaces and emits one
 * ("word", "count") tuple per non-empty word, with a count of 1.
 *
 * <p>NOTE(review): this listing appears under the "WordCountSpout" heading, but the code is
 * a second copy of SplitBolt; the spout's own source is not shown here.
 */
public class SplitBolt implements IRichBolt {
    private TopologyContext context;
    private OutputCollector collector;

    /** Keeps the context and collector passed in so that execute can emit and ack. */
    public void prepare(Map map, TopologyContext context, OutputCollector collector) {
        this.context = context;
        this.collector = collector;
    }

    /**
     * Reads the line from field 0, emits (word, 1) for every non-empty token,
     * then acks the input tuple.
     */
    public void execute(Tuple tuple) {
        String line = tuple.getString(0);
        String[] tokens = line.split(" ");
        for (String token : tokens) {
            // Leading or consecutive spaces make split(" ") return empty strings; those are not words.
            if (token.isEmpty()) {
                continue;
            }
            // Anchor the emitted tuple to its input so it is tracked as part of the same tuple tree.
            collector.emit(tuple, new Values(token, 1));
        }
        // An IRichBolt has to ack its input itself; without this the tuple is never marked as processed.
        collector.ack(tuple);
    }

    /** Nothing to release. */
    public void cleanup() {
    }

    /** Declares the two output fields: "word" and "count". */
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        outputFieldsDeclarer.declare(new Fields("word", "count"));
    }

    /** No component-specific configuration. */
    public Map<String, Object> getComponentConfiguration() {
        return null;
    }
}