Storm Word Count Walkthrough

The example wires one spout and three bolts into a topology: SentenceSpout emits sentences, SplitSentenceBolt splits them into words, WordCountBolt keeps a running count per word, and ReportBolt prints the totals; WordCountTopology builds and runs everything in local mode.

import java.util.Map;

import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;

public class SentenceSpout extends BaseRichSpout{
	
	private static final long serialVersionUID = 1L;

	/**
	 * This output collector exposes the API for emitting tuples from an {@link org.apache.storm.topology.IRichSpout}.
	 * The main difference between this output collector and {@link OutputCollector}
	 * for {@link org.apache.storm.topology.IRichBolt} is that spouts can tag messages with ids so that they can be
	 * acked or failed later on. This is the Spout portion of Storm's API to
	 * guarantee that each message is fully processed at least once.
	 */
	private SpoutOutputCollector collector;
	//private OutputCollector collector;
	
	// Sample sentences used as test data
	private String[] sentences={
			"my dog has fleas",
			"i like cold beverages",
			"the dog ate my homework",
			"don't have a cow man",
			"i don't think i like fleas"};
	
	private int index=0;
	
	/**
	 * For reference, this is how Storm's OutputFieldsDeclarer implementation records the
	 * fields declared for each stream:
	 *
	 * private Map<String, StreamInfo> _fields = new HashMap<>();
	 * public void declareStream(String streamId, boolean direct, Fields fields) {
	 *     if(_fields.containsKey(streamId)) {
	 *         throw new IllegalArgumentException("Fields for " + streamId + " already set");
	 *     }
	 *     _fields.put(streamId, new StreamInfo(fields.toList(), direct));
	 * }
	 */
	
	@Override
	public void declareOutputFields(OutputFieldsDeclarer declarer) {
		declarer.declare(new Fields("sentences"));
	}
	
	
	/**
	 * open() is defined in the ISpout interface and is called when the spout component is
	 * initialized. It receives three arguments:
	 *   - a Map containing the Storm configuration
	 *   - a TopologyContext providing information about the components in the topology
	 *   - a SpoutOutputCollector providing the methods used to emit tuples
	 */
	public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
		this.collector=collector;
	}
	
	/**
	 * nextTuple() is the core of every spout implementation: Storm calls this method
	 * repeatedly and the spout emits tuples through the output collector.
	 */
	public void nextTuple() {
		try {
			Thread.sleep(100);
		} catch (InterruptedException e) {
			// Restore the interrupt flag rather than swallowing the exception
			Thread.currentThread().interrupt();
			return;
		}
		this.collector.emit(new Values(sentences[index]));
		//System.out.println("===============");
		index++;
		if(index>=sentences.length){
			index=0;
		}
	}
}
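
The Javadoc quoted at the top of SentenceSpout notes that a spout can tag each emitted message with an ID so that it can be acked or failed later, which is how Storm guarantees at-least-once processing. The sketch below shows what a reliable variant of this spout could look like; the class name ReliableSentenceSpout, the UUID message IDs, and the empty ack()/fail() bodies are illustrative assumptions, not part of the original example.

import java.util.Map;
import java.util.UUID;

import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;

// Hypothetical reliable variant of SentenceSpout: every tuple carries a message ID,
// so Storm can call ack()/fail() on this spout once the tuple tree completes or times out.
public class ReliableSentenceSpout extends BaseRichSpout{

	private static final long serialVersionUID = 1L;

	private SpoutOutputCollector collector;
	private String[] sentences={
			"my dog has fleas",
			"i like cold beverages",
			"the dog ate my homework"};
	private int index=0;

	public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
		this.collector=collector;
	}

	public void nextTuple() {
		// The second emit() argument is the message ID used for ack/fail bookkeeping
		this.collector.emit(new Values(sentences[index]), UUID.randomUUID().toString());
		index=(index+1)%sentences.length;
	}

	@Override
	public void ack(Object msgId) {
		// The tuple tree rooted at msgId was fully processed
	}

	@Override
	public void fail(Object msgId) {
		// The tuple tree failed or timed out; a real spout would re-emit the sentence here
	}

	public void declareOutputFields(OutputFieldsDeclarer declarer) {
		declarer.declare(new Fields("sentences"));
	}
}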


import java.util.Map;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;

public class SplitSentenceBolt extends BaseRichBolt{

	private static final long serialVersionUID = 1L;
	private OutputCollector collector;
	
	public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
		this.collector=collector;
	}

	public void execute(Tuple input) {
		// Split the incoming sentence on spaces and emit one tuple per word
		String sentence=input.getStringByField("sentences");
		String[] words=sentence.split(" ");
		for(String word :words){
			this.collector.emit(new Values(word));
		}
	}

	public void declareOutputFields(OutputFieldsDeclarer declarer) {
		declarer.declare(new Fields("words"));
	}
}
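
SplitSentenceBolt above emits unanchored tuples and never acks its input, which is fine for a demo but gives up the at-least-once guarantee described in the SentenceSpout Javadoc. Below is a hedged sketch of an anchored variant; the class name is an assumption, and only the emit(input, ...) anchoring and the final ack(input) differ from the original bolt.

import java.util.Map;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;

// Hypothetical anchored variant of SplitSentenceBolt.
public class AnchoredSplitSentenceBolt extends BaseRichBolt{

	private static final long serialVersionUID = 1L;
	private OutputCollector collector;

	public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
		this.collector=collector;
	}

	public void execute(Tuple input) {
		String sentence=input.getStringByField("sentences");
		for(String word : sentence.split(" ")){
			// Anchoring each word tuple to the input links it into the input's tuple tree
			this.collector.emit(input, new Values(word));
		}
		// Ack the input only after all word tuples have been emitted
		this.collector.ack(input);
	}

	public void declareOutputFields(OutputFieldsDeclarer declarer) {
		declarer.declare(new Fields("words"));
	}
}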


import java.util.HashMap;
import java.util.Map;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;

public class WordCountBolt extends BaseRichBolt{

	private static final long serialVersionUID = 1L;
	
	private OutputCollector collector;
	private HashMap<String,Long> counts=null;
	
	/**
	 * As a rule, assign primitives and serializable objects in the constructor,
	 * and instantiate non-serializable objects in prepare(); see the sketch after
	 * this class for a concrete example.
	 */
	public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
		this.collector=collector;
		this.counts=new HashMap<String,Long>();
	}
	public void execute(Tuple input) {
		String word=input.getStringByField("words");
		Long count=this.counts.get(word);
		if(count==null){
			count=0L;
		}
		count++;
		// Put the updated count back into the map; re-putting the same word overwrites the
		// old value, so the map always holds the latest count for each word
		this.counts.put(word,count);
		this.collector.emit(new Values(word,count));
	}
	public void declareOutputFields(OutputFieldsDeclarer declarer) {
		declarer.declare(new Fields("word","count"));
	}	
}
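
The comment above WordCountBolt.prepare() about constructor versus prepare() initialization can be made concrete with a small sketch. In the hypothetical WordLoggerBolt below, a serializable setting (a file path) is assigned in the constructor and shipped with the serialized bolt, while a non-serializable PrintWriter is created in prepare() on the worker that runs the task; the class, its field names, and the idea of logging the "word" field are illustrative assumptions, not part of the original example.

import java.io.IOException;
import java.io.PrintWriter;
import java.util.Map;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Tuple;

// Hypothetical bolt illustrating the constructor-vs-prepare() rule of thumb.
public class WordLoggerBolt extends BaseRichBolt{

	private static final long serialVersionUID = 1L;

	// Serializable setting: assigned in the constructor, survives bolt serialization
	private final String logPath;

	// Non-serializable resource: marked transient and created in prepare()
	private transient PrintWriter writer;

	public WordLoggerBolt(String logPath) {
		this.logPath=logPath;
	}

	public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
		try {
			// A PrintWriter cannot be serialized, so it is opened here on the worker
			this.writer=new PrintWriter(logPath);
		} catch (IOException e) {
			throw new RuntimeException("could not open "+logPath, e);
		}
	}

	public void execute(Tuple input) {
		writer.println(input.getStringByField("word"));
		writer.flush();
	}

	public void declareOutputFields(OutputFieldsDeclarer declarer) {
		// this bolt does not emit anything
	}
}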


import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Tuple;

public class ReportBolt extends BaseRichBolt{

	private static final long serialVersionUID = 1L;
	
	private HashMap<String,Long> counts=null;

	public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
		this.counts=new HashMap<String,Long>();
	}

	public void execute(Tuple input) {
		String word=input.getStringByField("word");
		Long count=input.getLongByField("count");
		this.counts.put(word, count);
		
		
		// Print the running totals on every incoming tuple (despite the "FINAL COUNTS" label);
		// see the cleanup() sketch after this class for printing only once at shutdown
		System.out.println("--------FINAL COUNTS--------");
		List<String> keys=new ArrayList<String>();
		keys.addAll(this.counts.keySet());
		Collections.sort(keys);
		for(String key:keys){
			System.out.println(key+":"+this.counts.get(key));
		}
		System.out.println("----------------------------");
		
	}

	public void declareOutputFields(OutputFieldsDeclarer declarer) {
		// this bolt does not emit anything
	}
}
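
ReportBolt prints the full table for every incoming tuple. When running in a LocalCluster, one option is to also override cleanup(), which Storm calls on shutdown in local mode, and print the table a single time at the end. The method below is a sketch that would be added inside ReportBolt; it reuses the class's existing counts map and is not part of the original example.

	// Hypothetical addition to ReportBolt: print the totals once when the topology is shut down.
	// Note that cleanup() is only reliably called in local mode, not on a real cluster.
	@Override
	public void cleanup() {
		System.out.println("--------FINAL COUNTS--------");
		List<String> keys=new ArrayList<String>(this.counts.keySet());
		Collections.sort(keys);
		for(String key:keys){
			System.out.println(key+":"+this.counts.get(key));
		}
		System.out.println("----------------------------");
	}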

import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.tuple.Fields;

public class WordCountTopology{
	
	private static final String SENTENCE_SPOUT_ID="sentence-spout";
	private static final String SPLIT_BOLT_ID="split-bolt";
	private static final String COUNT_BOLT_ID="count-bolt";
	private static final String REPORT_BOLT_ID="report-bolt";
	private static final String TOPOLOGY_NAME="word-count-topology";
	
	public static void main(String[] args) throws InterruptedException {
		SentenceSpout spout=new SentenceSpout();
		SplitSentenceBolt splitbolt=new SplitSentenceBolt();
		WordCountBolt countbolt=new WordCountBolt();
		ReportBolt reportbolt=new ReportBolt();
		
		TopologyBuilder builder=new TopologyBuilder();
		// Parallelism hint of 2 executors; each task gets its own executor thread
		builder.setSpout(SENTENCE_SPOUT_ID,spout,2);
		// Parallelism hint of 2 executors with 4 tasks, so each executor runs 2 tasks
		builder.setBolt(SPLIT_BOLT_ID,splitbolt,2).setNumTasks(4).shuffleGrouping(SENTENCE_SPOUT_ID);
		// Sometimes tuples carrying particular values must be routed to a specific bolt instance.
		// Here fieldsGrouping guarantees that all tuples with the same "words" field value
		// are routed to the same WordCountBolt instance.
		builder.setBolt(COUNT_BOLT_ID,countbolt,2).fieldsGrouping(SPLIT_BOLT_ID,new Fields("words"));
		builder.setBolt(REPORT_BOLT_ID,reportbolt).globalGrouping(COUNT_BOLT_ID);
		
		/*Map conf=new HashMap();
		conf.put(Config.TOPOLOGY_WORKERS,4);
		conf.put(Config.TOPOLOGY_DEBUG,true);*/
		
		Config conf = new Config();
	    //conf.setDebug(true);
		LocalCluster cluster=new LocalCluster();
		cluster.submitTopology(TOPOLOGY_NAME,conf,builder.createTopology());
		
//		Thread.sleep(1000);
//		cluster.shutdown();
		
	}
}
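
The commented-out Thread.sleep / cluster.shutdown lines hint at the usual way of running this example: in local mode, let the topology run for a while and then stop it; against a real cluster, submit it with StormSubmitter instead. The fragment below is one way the end of main() could look, assuming an extra import of org.apache.storm.StormSubmitter and a main() signature of throws Exception; the 30-second run time and the args check are illustrative choices, not part of the original example.

		Config conf = new Config();
		if (args.length == 0) {
			// Local mode: run the topology for 30 seconds, then kill it and stop the embedded cluster
			LocalCluster cluster=new LocalCluster();
			cluster.submitTopology(TOPOLOGY_NAME, conf, builder.createTopology());
			Thread.sleep(30*1000);
			cluster.killTopology(TOPOLOGY_NAME);
			cluster.shutdown();
		} else {
			// Cluster mode: submit the packaged jar to a running Storm cluster under the given name
			StormSubmitter.submitTopology(args[0], conf, builder.createTopology());
		}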

