Kafka + Storm + HBase Stream Processing Demo

Environment:

Three Ubuntu 16.04 machines

Storm 1.2.2

HBase 1.4.9

Kafka 2.11-2.1.1

 

Required dependencies (pom.xml):

<dependencies>
        <dependency>
            <groupId>org.apache.storm</groupId>
            <artifactId>storm-core</artifactId>
            <version>1.1.1</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>0.11.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>0.96.1-hadoop2</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                    <archive>
                        <manifest>
                            <!-- Specify the class that contains the main method here -->
                            <mainClass>com.jinxLbj.MyTopology</mainClass>
                        </manifest>
                    </archive>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <!-- bind to the packaging phase -->
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
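
With this assembly configuration, running mvn clean package should produce, in the target directory, a jar with all dependencies bundled (by default its name ends in -jar-with-dependencies.jar); that is the jar to submit with the storm jar command in section 4 below.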

 

To do stream processing we first need a data source. Here we use Kafka as the message queue and simulate a continuous stream of incoming data.

1. Kafka

Producer (publishes data to the message queue indefinitely):

import java.util.Properties;
import java.util.Random;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;

public class MyProducer {

    public static void main(String[] args) throws InterruptedException {
        Properties properties = new Properties();
        // Address and port of a Kafka broker (bootstrap.servers, not ZooKeeper)
        properties.put("bootstrap.servers", "10.96.129.36:9092");
        properties.put("acks", "all");
        properties.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        properties.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        Producer<String, String> producer = new KafkaProducer<String, String>(properties);
        Random random = new Random();
        while (true) {
            // Send a message of eleven random digits separated by spaces, once per second
            StringBuilder message = new StringBuilder();
            for (int i = 0; i < 11; i++) {
                message.append(random.nextInt(10)).append(" ");
            }
            producer.send(new ProducerRecord<String, String>("IntCount", message.toString()));
            Thread.sleep(1000);
        }
    }

}
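
One assumption here is that the IntCount topic either already exists or the brokers are left with automatic topic creation enabled (auto.create.topics.enable defaults to true); otherwise, create the topic first with the kafka-topics script that ships with Kafka.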

Consumer (receives data from the message queue and hands it to Storm):

import java.util.Arrays;
import java.util.Properties;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;

public class MyConsumer {

    private static KafkaConsumer<String, String> kafkaConsumer;

    private static Thread myThread;

    public static void init(final String topic) {
        Properties properties = new Properties();
        // Address and port of a Kafka broker (bootstrap.servers, not ZooKeeper)
        properties.put("bootstrap.servers", "10.96.129.36:9092");
        properties.put("group.id", "group-1");
        properties.put("enable.auto.commit", "true");
        properties.put("auto.commit.interval.ms", "1000");
        // If a committed offset exists for a partition, resume from it; otherwise start from the beginning
        properties.put("auto.offset.reset", "earliest");
        // Session timeout of 30 seconds
        properties.put("session.timeout.ms", "30000");
        // Key/value deserializers
        properties.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        properties.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        kafkaConsumer = new KafkaConsumer<String, String>(properties);
        // Start a new thread that keeps polling the topic and pushes each record onto the spout's stack
        myThread = new Thread(new Runnable() {
            public void run() {
                // topic is the subscribed topic name
                kafkaConsumer.subscribe(Arrays.asList(topic));
                while (true) {
                    ConsumerRecords<String, String> records = kafkaConsumer.poll(100);
                    for (ConsumerRecord<String, String> record : records) {
                        MySpout.messages.push(record.value());
                    }
                }
            }
        });
        myThread.start();
    }

    public static void close() {
        kafkaConsumer.close();
    }

}
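
The hand-rolled consumer above pushes records onto a static Stack that MySpout (section 3) drains, which is fine for a demo but bypasses Storm's own reliability tracking. A production topology would more commonly use the storm-kafka-client integration and let Storm manage the Kafka consumer. Below is a minimal sketch, assuming Storm 1.2.x and the matching storm-kafka-client dependency are on the classpath; the spout emits tuples whose "value" field holds the Kafka record value, so the first bolt would read that field instead of "word".

import org.apache.storm.kafka.spout.KafkaSpout;
import org.apache.storm.kafka.spout.KafkaSpoutConfig;
import org.apache.storm.topology.TopologyBuilder;

// Sketch only: a Storm-managed Kafka spout in place of MyConsumer + MySpout.
public class KafkaSpoutSketch {

    public static TopologyBuilder buildTopology() {
        KafkaSpoutConfig<String, String> spoutConfig = KafkaSpoutConfig
                .builder("10.96.129.36:9092", "IntCount") // broker list and topic
                .setProp("group.id", "group-1")           // same consumer group as MyConsumer
                .build();
        TopologyBuilder builder = new TopologyBuilder();
        builder.setSpout("kafka-spout", new KafkaSpout<String, String>(spoutConfig));
        // Downstream bolts would consume the "value" field of the emitted tuples.
        return builder;
    }
}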

Next, we declare where the processed data will go.

2. HBase

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;

public class MyHBase {

    private static HBaseAdmin admin = null;

    private static Configuration configuration = null;

    // Initialize the HBase connection settings
    public static void init() throws IOException {
        configuration = HBaseConfiguration.create();
        // HDFS location of the HBase data (a client really only needs the ZooKeeper quorum below)
        configuration.set("hbase.rootdir", "hdfs://master:9000/opt/hbase/hbase_db");
        // ZooKeeper quorum addresses (comma-separated, no spaces)
        configuration.set("hbase.zookeeper.quorum", "10.96.129.36,10.96.129.41,10.96.129.43");
        admin = new HBaseAdmin(configuration);
    }

    // Create a table with the given column families
    public static void createTable(String tableName, String[] strs) throws IOException {
        if (!admin.tableExists(tableName)) {
            HColumnDescriptor[] hcDes = new HColumnDescriptor[strs.length];
            for (int i = 0; i < strs.length; i++) {
                hcDes[i] = new HColumnDescriptor(strs[i]);
            }
            HTableDescriptor tblDes = new HTableDescriptor(TableName.valueOf(tableName));
            for (HColumnDescriptor hc : hcDes) {
                tblDes.addFamily(hc);
            }
            admin.createTable(tblDes);
            System.out.println(tableName + " created successfully!");
        } else {
            System.out.println(tableName + " already exists!");
        }
    }

    // Insert or update a cell
    public static boolean put(String tableName, String rowkey, String columnFamily, String qualifier, String value) {
        try {
            HTable table = new HTable(configuration, tableName);
            Put put = new Put(rowkey.getBytes());
            put.add(columnFamily.getBytes(), qualifier.getBytes(), value.getBytes());
            table.put(put);
            System.out.println("put successfully! " + rowkey + "," + columnFamily + "," + qualifier + "," + value);
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
        return true;
    }

    // Get a single row
    public static Result getResult(String tableName, String rowkey) {
        System.out.println("get result. table=" + tableName + " rowkey=" + rowkey);
        try {
            HTable table = new HTable(configuration, tableName);
            Get get = new Get(rowkey.getBytes());
            return table.get(get);
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    // Convert a Result into a Map for easier printing
    private static Map<String, Object> result2Map(Result result) {
        Map<String, Object> ret = new HashMap<String, Object>();
        if (result != null && result.listCells() != null) {
            for (Cell cell : result.listCells()) {
                String key = Bytes.toString(CellUtil.cloneQualifier(cell));
                String value = Bytes.toString(CellUtil.cloneValue(cell));
                System.out.println(key + " => " + value);
                ret.put(key, value);
            }
        }
        return ret;
    }

    // Scan the whole table and sum up the counts
    public static void scan(String tableName) {
        System.out.println("scan table " + tableName);
        try {
            HTable table = new HTable(configuration, tableName);
            Scan scan = new Scan();
            ResultScanner rs = table.getScanner(scan);
            Integer total = 0;
            for (Result r : rs) {
                String num = Bytes.toString(r.getValue(Bytes.toBytes("num"), Bytes.toBytes("num1")));
                System.out.println(Bytes.toString(r.getRow()) + " -> " + num);
                total += Integer.valueOf(num);
            }
            System.out.println("total = " + total);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // Drop a table
    public static boolean delete(String tableName) {
        System.out.println("delete table " + tableName);
        try {
            HBaseAdmin admin = new HBaseAdmin(configuration);
            if (admin.tableExists(tableName)) {
                admin.disableTable(tableName);
                admin.deleteTable(tableName);
            }
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }

    public static void main(String[] args) throws IOException {
        init();
        scan("IntCount");
    }

}
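
As a quick way to check a single digit's running count outside the topology, the helpers above can be reused like this (the digit "7" is just an arbitrary example; it prints null if that digit has not been seen yet):

import java.io.IOException;

import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;

// Illustrative only: looks up the running count of one digit via the MyHBase helpers above.
public class QueryOneDigit {

    public static void main(String[] args) throws IOException {
        MyHBase.init();
        Result r = MyHBase.getResult("IntCount", "7");
        // The count is stored as a string under column family "num", qualifier "num1"
        String count = Bytes.toString(r.getValue(Bytes.toBytes("num"), Bytes.toBytes("num1")));
        System.out.println("7 -> " + count);
    }
}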

With the data source in place and the destination for the processed data declared, we can start working on the data itself. The Storm code follows.

3. Storm

MySpout (the data source of the whole topology; it takes the data fetched from Kafka and emits it for processing)

import java.io.IOException;
import java.util.Map;
import java.util.Stack;

import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;

public class MySpout extends BaseRichSpout {

    private static final long serialVersionUID = 225243592780939490L;

    private SpoutOutputCollector collector;
    private static final String field = "word";
    private int count = 0;
    // Message stack; the Kafka consumer pushes every record it receives onto this stack
    public static Stack<String> messages = new Stack<String>();

    // Initialization: start the Kafka consumer and set up HBase so processed data can be stored
    public void open(Map map, TopologyContext topologyContext, SpoutOutputCollector spoutOutputCollector) {
        this.collector = spoutOutputCollector;
        // Initialize the consumer; "IntCount" is the topic name
        MyConsumer.init("IntCount");
        try {
            // Initialize HBase (code shown above)
            MyHBase.init();
            // Create a table named IntCount in HBase with the column family "num"
            MyHBase.createTable("IntCount", new String[]{"num"});
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // Emit data to the downstream bolt
    public void nextTuple() {
        if (!messages.empty()) {
            String msg = messages.pop();
            this.collector.emit(new Values(msg));
        }
    }

    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        System.out.println("declaring output fields...");
        // Declare the format of the emitted tuples
        declarer.declare(new Fields(field));
    }

    @Override
    public void fail(Object obj) {
        System.out.println("failed: " + obj);
    }

    @Override
    public void close() {
        System.out.println("closing...");
    }
}
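
Note that nextTuple() emits each message without a message id, so Storm does not track these tuples and the fail() callback above will never fire for lost ones. To get at-least-once processing, the spout would have to emit with an id, for example collector.emit(new Values(msg), msg), keep the message around until ack() is called, and re-emit it on fail().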

MyBolt (receives data from MySpout and processes it)

import java.util.Map;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;

public class MyBolt extends BaseRichBolt {

    private static final long serialVersionUID = 4743224635827696343L;

    private OutputCollector collector;

    // Initialization of MyBolt
    public void prepare(Map map, TopologyContext topologyContext, OutputCollector outputCollector) {
        this.collector = outputCollector;
    }

    // Core processing: split the incoming string
    public void execute(Tuple tuple) {
        // The spout declared a single field named "word", so read it from the tuple;
        // it should be a string of digits separated by spaces
        String msg = tuple.getStringByField("word");
        // Split into individual digits
        String[] words = msg.toLowerCase().split(" ");
        // Emit each digit to the next bolt (MyBolt2) for counting
        for (String word : words) {
            System.out.println("split " + word);
            this.collector.emit(new Values(word)); // emit to the next bolt
        }
    }

    // Declare the output format for downstream consumers
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        outputFieldsDeclarer.declare(new Fields("word"));
    }

    @Override
    public void cleanup() {
        System.out.println("releasing resources");
    }
}

MyBolt2 (after the digits have been split, they need to be counted; this class does that)

import java.util.Map;

import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;

public class MyBolt2 extends BaseRichBolt {

    private static final long serialVersionUID = 4743224635827696343L;

    private OutputCollector collector;

    // Initialization
    public void prepare(Map map, TopologyContext topologyContext, OutputCollector outputCollector) {
        this.collector = outputCollector;
    }

    public void execute(Tuple tuple) {
        // msg is a single digit produced by MyBolt
        String msg = tuple.getStringByField("word");
        // Look up the row keyed by this digit in the IntCount table
        Result r = MyHBase.getResult("IntCount", msg);
        assert r != null;
        // How many times this digit has been seen so far
        String numS = Bytes.toString(r.getValue(Bytes.toBytes("num"), Bytes.toBytes("num1")));
        // If there is no value yet, this is the first occurrence: store 1
        if (numS == null || "".equals(numS)) {
            MyHBase.put("IntCount", msg, "num", "num1", "1");
        } else {
            // Otherwise increment the count and write it back
            Integer num = Integer.valueOf(numS);
            MyHBase.put("IntCount", msg, "num", "num1", String.valueOf(num + 1));
        }
    }

    // Declare the output format in case further bolts follow; this demo has none
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        outputFieldsDeclarer.declare(new Fields("word"));
    }

    @Override
    public void cleanup() {
        System.out.println("releasing resources");
    }
}
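
The read-then-put above relies on each row having a single writer, which the fieldsGrouping on "word" in the topology does guarantee here. If anything else wrote to the same rows, two writers could read the same old count and overwrite each other; HBase's atomic counter API avoids that. Below is a minimal sketch under that assumption, using the same IntCount table and num family; note that a counter written this way is stored as an 8-byte long, so it is read back with Bytes.toLong rather than Bytes.toString as in scan() above.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.util.Bytes;

// Hypothetical helper, not part of the demo above: atomically bumps the counter for one digit.
public class AtomicCounterSketch {

    public static long incrementDigit(String digit) throws IOException {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "10.96.129.36,10.96.129.41,10.96.129.43");
        HTable table = new HTable(conf, "IntCount");
        try {
            // Server-side atomic add; the stored value is an 8-byte long,
            // so read it back with Bytes.toLong, not Bytes.toString.
            return table.incrementColumnValue(
                    Bytes.toBytes(digit), Bytes.toBytes("num"), Bytes.toBytes("num1"), 1L);
        } finally {
            table.close();
        }
    }
}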

MyTopology (wires the components above into a single processing pipeline)

import org.apache.storm.Config;
import org.apache.storm.StormSubmitter;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.tuple.Fields;

public class MyTopology {

    public static void main(String[] args) {
        // Define the topology
        TopologyBuilder builder = new TopologyBuilder();
        // Set the spout (one executor/thread by default)
        builder.setSpout("spout1", new MySpout());
        // Each bolt gets one executor (thread) and one task;
        // bolt1 reads from spout1
        builder.setBolt("bolt1", new MyBolt(), 1).setNumTasks(1).shuffleGrouping("spout1");
        // bolt2 reads from bolt1
        builder.setBolt("bolt2", new MyBolt2(), 1).setNumTasks(1).fieldsGrouping("bolt1", new Fields("word"));
        Config conf = new Config();
        conf.put("test", "test"); // example custom config entry (not used by the demo)
        try {
            System.out.println("Submitting in remote mode");
            // args[0] is the topology name passed on the command line
            StormSubmitter.submitTopology(args[0], conf, builder.createTopology());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}
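
For quick testing without a Storm cluster, the same topology can also be run in Storm's local mode. A minimal sketch (this assumes the provided scope is removed from the storm-core dependency so that it is on the local classpath):

import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.tuple.Fields;

// Local-mode variant for testing on a single machine.
public class MyLocalTopology {

    public static void main(String[] args) throws Exception {
        TopologyBuilder builder = new TopologyBuilder();
        builder.setSpout("spout1", new MySpout());
        builder.setBolt("bolt1", new MyBolt(), 1).shuffleGrouping("spout1");
        builder.setBolt("bolt2", new MyBolt2(), 1).fieldsGrouping("bolt1", new Fields("word"));

        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology("IntCount", new Config(), builder.createTopology());
        Thread.sleep(60000); // let the topology run for a minute
        cluster.killTopology("IntCount");
        cluster.shutdown();
    }
}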

4. Run

Package the project, upload it to a server where Storm is installed, and run:

storm jar <path-to-jar> <main-class> <topology-name>

Example: storm jar storm_study_1.jar com.jinxLbj.MyTopology IntCount

You should then see "Finished submitting topology: IntCount".

 

5. Test

Run the main method of the Kafka producer shown at the top.

Wait a few seconds.

Then run the HBase query (the main method of MyHBase, which scans the IntCount table) to check the results.
