一、Grouping strategies (grouping) -------------------------------------------------------------------------
    1. shuffle -- random grouping, no pattern.
    2. fields  -- hashes on the grouping key; tuples with the same key go to the same task.
    3. all     -- sends the data to every downstream task, so each task holds its own copy of the message. Broadcast mode.
    4. direct  -- sends the data to one specified downstream task; only that task receives the message. Point-to-point mode.
        a. Look up the task id of the target:
            int taskID = 0;
            Map<Integer, String> map = context.getTaskToComponent();
            Set<Integer> set = map.keySet();
            for (Integer i : set) {
                if (map.get(i).equals("spiltBolt")) {
                    taskID = i;
                    break;
                }
            }
        b. Emit the tuple directly to that target:
            collector.emitDirect(taskID, new Values(line));
        c. In the App, set the grouping to direct grouping:
            builder.setBolt("spiltBolt", new SpiltBolt(), 8).directGrouping("wordSpout").setNumTasks(4);
    5. global  -- sorts all downstream task ids and sends every message to the task at the head of the list (the smallest id). Essentially a special case of direct.
        a. In the App, set the grouping to global grouping:
            builder.setBolt("spiltBolt", new SpiltBolt(), 4).globalGrouping("wordSpout").setNumTasks(4);
    6. Custom grouping
        a. Implement a custom CustomStreamGrouping class:
        ---------------------------------------------------------
package test.storm.group.custom;
import org.apache.storm.generated.GlobalStreamId;
import org.apache.storm.grouping.CustomStreamGrouping;
import org.apache.storm.task.WorkerTopologyContext;
import java.util.ArrayList;
import java.util.List;
/**
 * Custom grouping
*/
public class MyGrouping implements CustomStreamGrouping {
//task ids of the downstream tasks that receive the data
private List<Integer> targetTaskIds = new ArrayList<Integer>();
public void prepare(WorkerTopologyContext context, GlobalStreamId stream, List<Integer> targetTasks) {
targetTaskIds = targetTasks;
}
public List<Integer> chooseTasks(int taskId, List<Object> values) {
List<Integer> returnIds = new ArrayList<Integer>();
//route only to the first half of the target tasks
for (int i = 0; i <targetTaskIds.size() / 2 ; i++) {
returnIds.add(targetTaskIds.get(i));
}
return returnIds;
}
}
        b. In the App, set the grouping to the custom grouping:
            builder.setBolt("spiltBolt", new SpiltBolt(), 4).customGrouping("wordSpout", new MyGrouping()).setNumTasks(4);

二、Changing Storm's log output ------------------------------------------------------------------
    The default config ships inside storm-core-1.0.3.jar, at [main/resources/log4j2.xml]:
        <configuration monitorInterval="60">
            <Appenders>
                <Console name="Console" target="SYSTEM_OUT">
                    <PatternLayout pattern="%-4r [%t] %-5p %c{1.} - %msg%n"/>
                </Console>
            </Appenders>
            <Loggers>
                <Logger name="org.apache.zookeeper" level="ERROR"/>
                <Root level="error">
                    <AppenderRef ref="Console"/>
                </Root>
            </Loggers>
        </configuration>

三、Guaranteed message processing: ack/fail ----------------------------------------------------------------
    1. ack()  -- callback invoked once a tuple has been fully processed.
    2. fail() -- callback invoked when a tuple fails.
    3. msgID  -- to guarantee processing, every tuple the spout emits must carry a msgID.
    4. The last bolt in the chain must ack the tuple; Storm then calls ack() back on the spout to confirm the tuple was consumed successfully.
    5. If the tuple fails in any bolt along the chain, fail() is called back on the spout to report that consumption failed.

四、Guaranteeing consumption with two maps ------------------------------------------------------------------------
    1. Reworked spout (a bolt-side sketch that acks the tuple follows the spout code below):
    ----------------------------------------------------
package test.storm.group.custom;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.IRichSpout;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;
/**
 * Word-producing source spout -- the water tap
*/
public class WordSpout implements IRichSpout {
private TopologyContext context;
private SpoutOutputCollector collector;
//all in-flight tuples, keyed by msgId
private Map<Long, String> msgMap = new HashMap<Long, String>();
//failure count per msgId
private Map<Long, Integer> failMap = new HashMap<Long, Integer>();
public void open(Map map, TopologyContext topologyContext, SpoutOutputCollector spoutOutputCollector) {
context = topologyContext;
collector = spoutOutputCollector;
}
public void close() {
}
public void activate() {
}
public void deactivate() {
}
/**
 * Emit the next tuple
*/
public void nextTuple() {
String line = "how are you" + " tom" + new Random().nextInt(100);
//record the message before emitting it
Long msgId = System.currentTimeMillis();
msgMap.put(msgId, line);
collector.emit(new Values(line),msgId);
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
public void ack(Object o) {
if (msgMap.containsKey(o)) {
msgMap.remove(o);
}
if (failMap.containsKey(o)) {
failMap.remove(o);
}
}
public void fail(Object o) {
if (failMap.containsKey(o)) {
int count = failMap.get(o);
count ++;
if (count > 3) {
//give up on this message after too many failures
msgMap.remove(o);
failMap.remove(o);
} else {
failMap.put((Long)o,count);
//re-emit
String line = msgMap.get(o);
collector.emit(new Values(line),o);
}
}
else
{
failMap.put((Long)o,1);
//re-emit
String line = msgMap.get(o);
collector.emit(new Values(line),o);
}
}
public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
outputFieldsDeclarer.declare(new Fields("line"));
}
public Map<String, Object> getComponentConfiguration() {
return null;
}
}
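To satisfy item 4 of section 三, the last bolt in the chain has to ack each tuple so that the spout's ack()/fail() callbacks fire. Below is a minimal sketch of such a bolt; the class name AckSpiltBolt and its ("word", "count") output fields are illustrative assumptions, not code from these notes:

package test.storm.group.custom;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.IRichBolt;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import java.util.Map;
/**
 * Last bolt in the chain: processes the line, then acks (or fails) the tuple,
 * which triggers ack()/fail() on the WordSpout above. Illustrative sketch only.
 */
public class AckSpiltBolt implements IRichBolt {
    private OutputCollector collector;
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
    }
    public void execute(Tuple input) {
        try {
            String line = input.getString(0);
            for (String word : line.split(" ")) {
                //anchor each emitted word to the input tuple
                collector.emit(input, new Values(word, 1));
            }
            //tell Storm this tuple was consumed successfully -> spout.ack(msgId)
            collector.ack(input);
        } catch (Exception e) {
            //tell Storm this tuple failed -> spout.fail(msgId), which re-emits it (up to 3 times above)
            collector.fail(input);
        }
    }
    public void cleanup() {
    }
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("word", "count"));
    }
    public Map<String, Object> getComponentConfiguration() {
        return null;
    }
}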
五、Integrating Kafka with Storm --------------------------------------------------------------
    1. Storm acts as a Kafka consumer: it pulls messages off the Kafka topic and hands them to the Storm cluster for computation.
    2. Add the Maven dependencies:
        <dependency>
            <groupId>org.apache.storm</groupId>
            <artifactId>storm-kafka</artifactId>
            <version>1.0.3</version>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka_2.10</artifactId>
            <version>0.8.1.1</version>
            <exclusions>
                <exclusion>
                    <groupId>org.apache.zookeeper</groupId>
                    <artifactId>zookeeper</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>log4j</groupId>
                    <artifactId>log4j</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
    3. Start the Kafka and Storm clusters.
    4. Rewrite the App class:
    ----------------------------------------------
package test.storm.kafka;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.kafka.*;
import org.apache.storm.spout.SchemeAsMultiScheme;
import org.apache.storm.topology.TopologyBuilder;
import java.util.UUID;
public class App {
public static void main(String [] args)
{
TopologyBuilder builder = new TopologyBuilder();
//ZooKeeper connection string
String zkConnString = "s200:2181,s300:2181,s400:2181";
//Kafka broker hosts resolved through ZooKeeper
BrokerHosts hosts = new ZkHosts(zkConnString);
//Kafka spout configuration: hosts, topic, ZK root path, consumer id
SpoutConfig spoutConfig = new SpoutConfig(hosts, "test", "/test", UUID.randomUUID().toString());
spoutConfig.scheme = new SchemeAsMultiScheme(new StringScheme());
KafkaSpout kafkaSpout = new KafkaSpout(spoutConfig);
builder.setSpout("kafkaspout", kafkaSpout).setNumTasks(2);
builder.setBolt("split-bolt", new SpiltBolt(),2).shuffleGrouping("kafkaspout").setNumTasks(2);
Config conf = new Config();
conf.setNumWorkers(2);
conf.setDebug(true);
/**
 * Run Storm in local mode (a cluster-mode alternative is sketched after this class)
*/
LocalCluster cluster = new LocalCluster();
cluster.submitTopology("wc", conf, builder.createTopology());
}
}
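The App above only runs Storm in local mode. On a real cluster the LocalCluster block is normally replaced with StormSubmitter; a minimal sketch of that alternative, keeping the topology name "wc" and the conf/builder variables from the code above:

            //cluster-mode alternative to the LocalCluster block:
            //requires import org.apache.storm.StormSubmitter; and main() must declare "throws Exception"
            //(or catch AlreadyAliveException / InvalidTopologyException / AuthorizationException)
            StormSubmitter.submitTopology("wc", conf, builder.createTopology());

Package the project as a jar and launch it with the storm jar command; during a test, messages can be pushed into the "test" topic with kafka-console-producer so the spout has data to consume.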
六、Integrating HBase with Storm --------------------------------------------------------------
    1. Overview
        Write the computation results into an HBase table.
        HBase: high throughput, random access, real-time reads and writes.
        master | regionServer | region | wal | hadoop
    2. Create the HBase wordcount table with column family f1:
        $>hbase shell
        $hbase shell>create 'ns1:wordcount' , 'f1'
    3. Add the Maven dependencies:
        <dependencies>
            <dependency>
                <groupId>org.apache.storm</groupId>
                <artifactId>storm-core</artifactId>
                <version>1.0.3</version>
            </dependency>
            <dependency>
                <groupId>junit</groupId>
                <artifactId>junit</artifactId>
                <version>4.11</version>
            </dependency>
            <dependency>
                <groupId>org.apache.storm</groupId>
                <artifactId>storm-kafka</artifactId>
                <version>1.0.2</version>
            </dependency>
            <dependency>
                <groupId>log4j</groupId>
                <artifactId>log4j</artifactId>
                <version>1.2.17</version>
            </dependency>
            <dependency>
                <groupId>org.apache.kafka</groupId>
                <artifactId>kafka_2.10</artifactId>
                <version>0.8.1.1</version>
                <exclusions>
                    <exclusion>
                        <groupId>org.apache.zookeeper</groupId>
                        <artifactId>zookeeper</artifactId>
                    </exclusion>
                    <exclusion>
                        <groupId>log4j</groupId>
                        <artifactId>log4j</artifactId>
                    </exclusion>
                </exclusions>
            </dependency>
            <dependency>
                <groupId>org.apache.storm</groupId>
                <artifactId>storm-hbase</artifactId>
                <version>1.0.3</version>
            </dependency>
            <dependency>
                <groupId>org.apache.hbase</groupId>
                <artifactId>hbase-client</artifactId>
                <version>1.2.3</version>
            </dependency>
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-common</artifactId>
                <version>2.7.3</version>
            </dependency>
        </dependencies>
    4. Copy the HBase configuration files [hbase-site.xml/hdfs-site.xml] into the resources directory.
    5. HbaseBolt (a topology-wiring sketch follows the bolt code below):
    ----------------------------------------------------------
package test.storm.hbase;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.IRichBolt;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.tuple.Tuple;
import java.io.IOException;
import java.util.Map;
/**
 * Writes data into HBase
*/
public class HbaseBolt implements IRichBolt {
private TopologyContext context;
private OutputCollector collector;
private Table tb;
public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
this.context = context;
this.collector = collector;
try {
//load the HBase configuration (hbase-site.xml on the classpath)
Configuration conf = HBaseConfiguration.create();
//create the connection through the factory
Connection conn = ConnectionFactory.createConnection(conf);
//get table
TableName tbName = TableName.valueOf("ns1:wordcount");
tb = conn.getTable(tbName);
} catch (IOException e) {
e.printStackTrace();
}
}
public void execute(Tuple input) {
String word = input.getString(0);
int count = input.getInteger(1);
System.out.println("word : count => " + word + ":" + count);
//use HBase's increment counter mechanism to accumulate the word count
byte[] rowkey = Bytes.toBytes(word);
byte[] f = Bytes.toBytes("f1");
byte[] c = Bytes.toBytes("count");
try {
tb.incrementColumnValue(rowkey, f, c, count);
//confirm the tuple once the counter update succeeds
collector.ack(input);
} catch (IOException e) {
//report the failure so the tuple can be replayed
collector.fail(input);
e.printStackTrace();
}
}
public void cleanup() {
}
public void declareOutputFields(OutputFieldsDeclarer declarer) {
}
public Map<String, Object> getComponentConfiguration() {
return null;
}
}
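To wire HbaseBolt into a topology, a minimal App sketch is shown below. It reuses the Kafka spout configuration from section 五 and assumes that SpiltBolt (the user's split bolt, not shown in these notes) declares and emits ("word", "count") tuples, e.g. (word, 1) per occurrence; the class name HbaseApp and the component names are illustrative assumptions:

package test.storm.hbase;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.kafka.*;
import org.apache.storm.spout.SchemeAsMultiScheme;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.tuple.Fields;
import java.util.UUID;
public class HbaseApp {
    public static void main(String[] args) {
        TopologyBuilder builder = new TopologyBuilder();
        //Kafka spout, configured the same way as the App class in section 五
        BrokerHosts hosts = new ZkHosts("s200:2181,s300:2181,s400:2181");
        SpoutConfig spoutConfig = new SpoutConfig(hosts, "test", "/test", UUID.randomUUID().toString());
        spoutConfig.scheme = new SchemeAsMultiScheme(new StringScheme());
        builder.setSpout("kafkaspout", new KafkaSpout(spoutConfig));
        //SpiltBolt is assumed to emit ("word", "count") tuples
        builder.setBolt("split-bolt", new SpiltBolt(), 2).shuffleGrouping("kafkaspout");
        //fields grouping on "word" so all increments for one word hit the same HbaseBolt task
        builder.setBolt("hbase-bolt", new HbaseBolt(), 2).fieldsGrouping("split-bolt", new Fields("word"));
        Config conf = new Config();
        conf.setNumWorkers(2);
        new LocalCluster().submitTopology("wc-hbase", conf, builder.createTopology());
    }
}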
    6. Check the HBase table data (a Java read-back sketch follows):
        $hbase> get_counter 'ns1:wordcount' , 'word' , 'f1:count'
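The counter can also be read back from Java. A minimal sketch, assuming hbase-site.xml is on the classpath; the class name ReadCounter and the row key "tom" are illustrative only:

package test.storm.hbase;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;
/**
 * Reads a word's counter back from ns1:wordcount. Illustrative sketch only.
 */
public class ReadCounter {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        Connection conn = ConnectionFactory.createConnection(conf);
        Table tb = conn.getTable(TableName.valueOf("ns1:wordcount"));
        //the row key is the word itself; "tom" is only an example value
        Result r = tb.get(new Get(Bytes.toBytes("tom")));
        byte[] v = r.getValue(Bytes.toBytes("f1"), Bytes.toBytes("count"));
        //incrementColumnValue stores the counter as an 8-byte long
        System.out.println("tom => " + (v == null ? 0 : Bytes.toLong(v)));
        tb.close();
        conn.close();
    }
}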