Big Data: Storm (Part 2)



1. Developing the WordCount Program: Real-Time Computation

[Figure: the WordCount topology. A spout collects sentences and emits them; a split bolt tokenizes each sentence into words; a count bolt keeps a running total per word. The full code appears in Section 4.]



2. How a Storm Topology Is Submitted

[Figure: the topology submission flow. The client uploads the topology jar to Nimbus with the storm jar command; Nimbus stores the code and writes task assignments to ZooKeeper; the Supervisors poll ZooKeeper and launch Worker processes, which run the topology's spouts and bolts.]



3. Storm's Internal Communication Mechanism: Work Is Executed by Executors inside Workers

[Figure: Storm's internal messaging. Executors inside the same Worker exchange tuples through in-memory queues (LMAX Disruptor), while tuples between Workers travel over the network (Netty in current Storm versions).]



4. Integrating with External Systems
    1. Typical architecture of a streaming system:

         Data source (website) -----> Flume ----> Kafka (topic broadcast) -----> Storm           ----|
                                                                             |--> Spark Streaming ----|----> Redis, HBase, HDFS, JDBC
                                                                             |--> Flink           ----|      (Hive, Kafka, JMS)

The classes below implement the WordCount topology from Section 1 and wire it to these sinks (Redis, HDFS, and HBase); the commented-out createKafkaSpout() call in the topology marks where a Kafka source would plug in.

WordCountSpout.java: the data source. It simulates collection by emitting a random sentence every 3 seconds.
package demo;

import java.util.Map;
import java.util.Random;

import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;
import org.apache.storm.utils.Utils;

/*
 * Purpose: collect data and send it to the next bolt component.
 * Here the data source is simulated by generating a few fixed sentences.
 */
public class WordCountSpout extends BaseRichSpout {

    // the output collector of this spout
    private SpoutOutputCollector collector;

    // the simulated data
    private String[] data = {"I love Beijing","I love China","Beijing is the capital of China"};

    @Override
    public void nextTuple() {
        // Called repeatedly by the Storm framework to pull data from the external system.
        // Collect one record every 3 seconds.
        Utils.sleep(3000);

        // Pick a random string to represent the collected data.
        int random = (new Random()).nextInt(3); // random number in [0, 3)

        // Collect the data and send it to the next component.
        System.out.println("Collected data: " + data[random]);
        this.collector.emit(new Values(data[random]));
    }

    /**
     * SpoutOutputCollector collector: the output collector of the spout component
     */
    @Override
    public void open(Map arg0, TopologyContext arg1, SpoutOutputCollector collector) {
        // initialization method of the spout component
        this.collector = collector;
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // declare the schema of the output
        declarer.declare(new Fields("sentence"));
    }
}
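
The spout above emits unanchored tuples, so anything lost downstream is simply dropped. Storm also offers at-least-once processing: if the spout emits each tuple with a message ID, the framework calls ack() or fail() back on the spout once the tuple tree completes or times out. Below is a minimal sketch of that pattern. It is a hypothetical variant, not part of the original post, and it only takes effect if the downstream bolts also anchor their emits to the input tuple (this.collector.emit(tuple, new Values(w, 1))) and then call this.collector.ack(tuple).

package demo;

import java.util.Map;
import java.util.Random;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;
import org.apache.storm.utils.Utils;

// Hypothetical variant of WordCountSpout that uses Storm's reliability API.
public class ReliableWordCountSpout extends BaseRichSpout {

    private SpoutOutputCollector collector;
    private String[] data = {"I love Beijing","I love China","Beijing is the capital of China"};

    // in-flight tuples, keyed by message ID, so fail() can replay them
    private Map<String, String> pending = new ConcurrentHashMap<>();

    @Override
    public void open(Map arg0, TopologyContext arg1, SpoutOutputCollector collector) {
        this.collector = collector;
    }

    @Override
    public void nextTuple() {
        Utils.sleep(3000);
        String sentence = data[(new Random()).nextInt(3)];
        String msgId = UUID.randomUUID().toString();
        pending.put(msgId, sentence);
        // Emitting with a message ID makes Storm call ack()/fail() back on this spout.
        this.collector.emit(new Values(sentence), msgId);
    }

    @Override
    public void ack(Object msgId) {
        // the tuple tree was fully processed downstream
        pending.remove(msgId);
    }

    @Override
    public void fail(Object msgId) {
        // the tuple failed or timed out: replay it
        String sentence = pending.get(msgId);
        if (sentence != null) {
            this.collector.emit(new Values(sentence), msgId);
        }
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("sentence"));
    }
}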

WordCountSplitBolt.java: the first bolt, which splits each sentence into words.

package demo;

import java.util.Map;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;

/*
 * The first bolt component: splits each sentence into words.
 */
public class WordCountSplitBolt extends BaseRichBolt {

    // the output collector of this bolt
    private OutputCollector collector;

    @Override
    public void execute(Tuple tuple) { // components pass tuples to each other
        // Process the data sent by the previous component.
        // Get the data.
        String line = tuple.getStringByField("sentence");

        // Split the sentence into words.
        String[] words = line.split(" ");

        // Emit each word with an initial count of 1.
        for (String w : words) {
            this.collector.emit(new Values(w, 1));
        }
    }


    // OutputCollector collector: the output collector of the bolt component
    @Override
    public void prepare(Map arg0, TopologyContext arg1, OutputCollector collector) {
        // initialize the bolt component
        this.collector = collector;
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // declare the schema of the output
        declarer.declare(new Fields("word","count"));
    }

}

WordCountTotalBolt.java: the second bolt, which keeps a running count per word.

package demo;

import java.util.HashMap;
import java.util.Map;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;

/*
 * The second bolt component: counts the words.
 */
public class WordCountTotalBolt extends BaseRichBolt {
    // the output collector of this bolt
    private OutputCollector collector;

    // a Map holding the running result: word -> total count
    private Map<String, Integer> result = new HashMap<>();

    @Override
    public void execute(Tuple tuple) {
        // Process the data sent by the previous component.
        // Get the data: a word and its frequency (always 1 here).
        String word = tuple.getStringByField("word");
        int count = tuple.getIntegerByField("count");

        if (result.containsKey(word)) {
            // the word already exists: accumulate
            int total = result.get(word);
            result.put(word, total + count);
        } else {
            // the word does not exist yet
            result.put(word, count);
        }

        // Print the current result.
        System.out.println("Current result: " + result);
        // Send the updated total to the next component.
        this.collector.emit(new Values(word, result.get(word)));
    }

    @Override
    public void prepare(Map arg0, TopologyContext arg1, OutputCollector collector) {
        // initialize the bolt component
        this.collector = collector;
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // declare the schema of the output
        declarer.declare(new Fields("word","total"));
    }

}

WordCountTopology.java: assembles the topology and wires in the Redis, HDFS, or HBase sink.

package demo;

import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.StormSubmitter;
import org.apache.storm.generated.AlreadyAliveException;
import org.apache.storm.generated.AuthorizationException;
import org.apache.storm.generated.InvalidTopologyException;
import org.apache.storm.generated.StormTopology;
import org.apache.storm.hdfs.bolt.HdfsBolt;
import org.apache.storm.hdfs.bolt.format.DefaultFileNameFormat;
import org.apache.storm.hdfs.bolt.format.DelimitedRecordFormat;
import org.apache.storm.hdfs.bolt.rotation.FileSizeRotationPolicy;
import org.apache.storm.hdfs.bolt.rotation.FileSizeRotationPolicy.Units;
import org.apache.storm.hdfs.bolt.sync.CountSyncPolicy;
import org.apache.storm.redis.bolt.RedisStoreBolt;
import org.apache.storm.redis.common.config.JedisPoolConfig;
import org.apache.storm.redis.common.mapper.RedisDataTypeDescription;
import org.apache.storm.redis.common.mapper.RedisStoreMapper;
import org.apache.storm.topology.IRichBolt;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.ITuple;

public class WordCountTopology {

    public static void main(String[] args) throws Exception {
        // Create a job: Topology = spout + bolt(s).

        TopologyBuilder builder = new TopologyBuilder();

        // First component of the job: the spout.
        builder.setSpout("mywordcount_spout", new WordCountSpout());
        //builder.setSpout("mywordcount_spout", createKafkaSpout());

        // Second component: a bolt that splits each sentence into words.
        // Grouping strategy: shuffle grouping (tuples are distributed randomly).
        builder.setBolt("mywordcount_split", new WordCountSplitBolt()).shuffleGrouping("mywordcount_spout");

        // Third component: a bolt that counts the words.
        // Grouping strategy: fields grouping on "word", so tuples carrying the same
        // word always reach the same bolt task and the per-word totals stay consistent.
        builder.setBolt("mywordcount_total", new WordCountTotalBolt()).fieldsGrouping("mywordcount_split", new Fields("word"));

        // Fourth component, option 1: a bolt that writes the result to Redis.
        //builder.setBolt("mywordcount_redis", createRedisBolt()).shuffleGrouping("mywordcount_total");

        // Fourth component, option 2: a bolt that writes the result to HDFS.
        //builder.setBolt("mywordcount_hdfs", createHDFSBolt()).shuffleGrouping("mywordcount_total");

        // Fourth component, option 3: a bolt that writes the result to HBase.
        builder.setBolt("mywordcount_hbase", new WordCountHBaseBolt()).shuffleGrouping("mywordcount_total");

        // Create the topology.
        StormTopology topology = builder.createTopology();

        // Configuration parameters.
        Config conf = new Config();

        // Submit the job. Option 1: local mode; option 2: cluster mode.
        // Option 1: local mode.
        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology("mywordcount", conf, topology);
        // Arguments: 1. job name; 2. configuration; 3. the topology itself.

        // Option 2: cluster mode: storm jar temp/storm.jar demo.WordCountTopology MyStormWordCount
        //StormSubmitter.submitTopology(args[0], conf, topology);
    }

    private static IRichBolt createHDFSBolt() {
        // Create an HDFS bolt component that writes the results to HDFS.
        HdfsBolt bolt = new HdfsBolt()
                // the HDFS namenode address
                .withFsUrl("hdfs://192.168.157.11:9000")
                // the HDFS directory where the data is stored
                .withFileNameFormat(new DefaultFileNameFormat().withPath("/stormresult"))
                // the delimiter between key and value, e.g. Beijing|10
                .withRecordFormat(new DelimitedRecordFormat().withFieldDelimiter("|"))
                // file rotation policy: start a new file every 5 MB
                .withRotationPolicy(new FileSizeRotationPolicy(5.0f, Units.MB))
                // sync policy: flush to HDFS after every 1024 tuples
                .withSyncPolicy(new CountSyncPolicy(1024));

        return bolt;
    }

    private static IRichBolt createRedisBolt() {
        // Create a Redis bolt component that writes the data to Redis.
        // Build a Redis (Jedis) connection pool.
        JedisPoolConfig.Builder builder = new JedisPoolConfig.Builder();
        builder.setHost("192.168.157.11");
        builder.setPort(6379);
        JedisPoolConfig poolConfig = builder.build();

        // storeMapper: describes the format of the data stored in Redis.
        return new RedisStoreBolt(poolConfig, new RedisStoreMapper() {

            @Override
            public RedisDataTypeDescription getDataTypeDescription() {
                // Declare the Redis data type: a hash named "wordcount".
                // After running, the result can be checked with: redis-cli HGETALL wordcount
                return new RedisDataTypeDescription(RedisDataTypeDescription.RedisDataType.HASH,"wordcount");
            }

            @Override
            public String getValueFromTuple(ITuple tuple) {
                // the value received from the previous component
                return String.valueOf(tuple.getIntegerByField("total"));
            }

            @Override
            public String getKeyFromTuple(ITuple tuple) {
                // the key received from the previous component
                return tuple.getStringByField("word");
            }
        });
    }
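
    /*
     * Hypothetical sketch of the createKafkaSpout() helper that is referenced
     * (commented out) in main() but was not defined in the original post. It
     * assumes the storm-kafka-client dependency is on the classpath; the broker
     * address and the topic name "mytopic" are placeholders.
     *
     * Note: the default record translator emits tuples with the fields
     * ("topic", "partition", "offset", "key", "value"), so the split bolt would
     * have to read the "value" field instead of "sentence".
     */
    private static org.apache.storm.kafka.spout.KafkaSpout<String, String> createKafkaSpout() {
        org.apache.storm.kafka.spout.KafkaSpoutConfig<String, String> spoutConfig =
                org.apache.storm.kafka.spout.KafkaSpoutConfig.builder("192.168.157.11:9092", "mytopic")
                        .build();
        return new org.apache.storm.kafka.spout.KafkaSpout<>(spoutConfig);
    }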

}
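
To run on a real cluster instead of in local mode, comment out the LocalCluster lines, uncomment the StormSubmitter call, package the project into a jar, and submit it with the command shown in the comment above:

storm jar temp/storm.jar demo.WordCountTopology MyStormWordCount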

WordCountHBaseBolt.java: a bolt that writes each (word, total) pair into HBase.

package demo;

import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Tuple;

/*
 * First create the HBase table: create 'result','info'
 */
public class WordCountHBaseBolt extends BaseRichBolt {

    // the HBase client
    private HTable table;

    @Override
    public void execute(Tuple tuple) {
        // Get the data processed by the previous component.
        String word = tuple.getStringByField("word");
        int total = tuple.getIntegerByField("total");

        // Build a Put: row key = word, columns info:word and info:total.
        Put put = new Put(Bytes.toBytes(word));
        put.add(Bytes.toBytes("info"), Bytes.toBytes("word"), Bytes.toBytes(word));
        put.add(Bytes.toBytes("info"), Bytes.toBytes("total"), Bytes.toBytes(String.valueOf(total)));
        try {
            table.put(put);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    @Override
    public void prepare(Map arg0, TopologyContext arg1, OutputCollector arg2) {
        // Initialization: set up the HBase client.
        // HBaseConfiguration.create() loads hbase-default/hbase-site;
        // then point it at the ZooKeeper quorum.
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "192.168.157.11");
        try {
            table = new HTable(conf, "result");
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer arg0) {
        // This is the terminal bolt of the topology; it emits nothing downstream.
    }

}
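
The HTable constructor and Put.add used above are deprecated in HBase 1.x and removed in 2.x. Below is a sketch of the same bolt against the newer Connection API; it is a hypothetical variant, not from the original post, with the table name and ZooKeeper quorum carried over from above.

package demo;

import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Tuple;

// Hypothetical variant of WordCountHBaseBolt using the HBase Connection API.
public class WordCountHBaseBoltV2 extends BaseRichBolt {

    private Connection connection;
    private Table table;

    @Override
    public void prepare(Map arg0, TopologyContext arg1, OutputCollector arg2) {
        try {
            Configuration conf = HBaseConfiguration.create();
            conf.set("hbase.zookeeper.quorum", "192.168.157.11");
            connection = ConnectionFactory.createConnection(conf);
            table = connection.getTable(TableName.valueOf("result"));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    @Override
    public void execute(Tuple tuple) {
        String word = tuple.getStringByField("word");
        int total = tuple.getIntegerByField("total");

        Put put = new Put(Bytes.toBytes(word));
        // addColumn replaces the deprecated Put.add(family, qualifier, value)
        put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("total"), Bytes.toBytes(String.valueOf(total)));
        try {
            table.put(put);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    @Override
    public void cleanup() {
        // release the HBase resources when the worker shuts down
        try {
            table.close();
            connection.close();
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // terminal bolt: nothing to declare
    }
}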