Storm components can interact with the HDFS filesystem.
Usage:
The example below writes files to a path on HDFS, using the pipe character | as the field delimiter, syncing to the filesystem every 10 tuples, and rotating to a new file every 5 MB.
First, create the path on HDFS:
hadoop fs -mkdir /storm_write_hdfs
Set the path permissions:
hadoop fs -chmod -R 777 /storm_write_hdfs
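Optionally, verify that the directory exists with the expected permissions by listing the HDFS root:
hadoop fs -ls /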
pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.imooc.bigdata</groupId>
  <artifactId>storm</artifactId>
  <version>1.0</version>
  <packaging>jar</packaging>
  <name>storm</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <storm.version>1.1.1</storm.version>
    <hadoop.version>2.6.0-cdh5.7.0</hadoop.version>
  </properties>

  <!-- add the cloudera repository -->
  <repositories>
    <repository>
      <id>cloudera</id>
      <url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
    </repository>
  </repositories>

  <dependencies>
    <dependency>
      <groupId>org.apache.storm</groupId>
      <artifactId>storm-core</artifactId>
      <version>${storm.version}</version>
      <exclusions>
        <exclusion>
          <groupId>org.slf4j</groupId>
          <artifactId>log4j-over-slf4j</artifactId>
        </exclusion>
        <exclusion>
          <groupId>org.slf4j</groupId>
          <artifactId>slf4j-api</artifactId>
        </exclusion>
      </exclusions>
    </dependency>
    <dependency>
      <groupId>commons-io</groupId>
      <artifactId>commons-io</artifactId>
      <version>2.4</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>${hadoop.version}</version>
      <exclusions>
        <exclusion>
          <groupId>com.google.guava</groupId>
          <artifactId>guava</artifactId>
        </exclusion>
      </exclusions>
    </dependency>
    <dependency>
      <groupId>org.apache.storm</groupId>
      <artifactId>storm-hdfs</artifactId>
      <version>${storm.version}</version>
      <exclusions>
        <exclusion>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-client</artifactId>
        </exclusion>
        <exclusion>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-auth</artifactId>
        </exclusion>
      </exclusions>
    </dependency>
    <dependency>
      <groupId>com.google.guava</groupId>
      <artifactId>guava</artifactId>
      <version>16.0.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.curator</groupId>
      <artifactId>curator-client</artifactId>
      <version>2.12.0</version>
    </dependency>
  </dependencies>
</project>
Code
package com.imooc.bigdata.integration.hdfs;

import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.hdfs.bolt.HdfsBolt;
import org.apache.storm.hdfs.bolt.format.DefaultFileNameFormat;
import org.apache.storm.hdfs.bolt.format.DelimitedRecordFormat;
import org.apache.storm.hdfs.bolt.format.FileNameFormat;
import org.apache.storm.hdfs.bolt.format.RecordFormat;
import org.apache.storm.hdfs.bolt.rotation.FileRotationPolicy;
import org.apache.storm.hdfs.bolt.rotation.FileSizeRotationPolicy;
import org.apache.storm.hdfs.bolt.sync.CountSyncPolicy;
import org.apache.storm.hdfs.bolt.sync.SyncPolicy;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import org.apache.storm.utils.Utils;

import java.util.Map;
import java.util.Random;

/**
 * Error encountered:
 * Caused by: org.apache.hadoop.ipc.RemoteException:
 * Permission denied: user=jy02268879, access=WRITE, inode="/":hadoop:supergroup:drwxr-xr-x
 * Resolved here by simply setting the HDFS path permissions to 777:
 * hadoop fs -chmod -R 777 /
 */
public class LocalWordCountHDFSStormTopology {

    public static class DataSourceSpout extends BaseRichSpout {

        private SpoutOutputCollector collector;

        public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
            this.collector = collector;
        }

        public static final String[] words = new String[]{"apple", "orange", "pineapple", "banana", "watermelon"};

        public void nextTuple() {
            // emit a random word every 200 ms
            Random random = new Random();
            String word = words[random.nextInt(words.length)];
            this.collector.emit(new Values(word));
            System.out.println("emit: " + word);
            Utils.sleep(200);
        }

        public void declareOutputFields(OutputFieldsDeclarer declarer) {
            declarer.declare(new Fields("line"));
        }
    }

    /**
     * Splits the incoming data.
     */
    public static class SplitBolt extends BaseRichBolt {

        private OutputCollector collector;

        public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
            this.collector = collector;
        }

        /**
         * Business logic: read the "line" field and re-emit it as "word".
         */
        public void execute(Tuple input) {
            String word = input.getStringByField("line");
            this.collector.emit(new Values(word));
        }

        public void declareOutputFields(OutputFieldsDeclarer declarer) {
            declarer.declare(new Fields("word"));
        }
    }

    public static void main(String[] args) {
        // build the Topology from the Spout and Bolts via TopologyBuilder
        TopologyBuilder builder = new TopologyBuilder();
        builder.setSpout("DataSourceSpout", new DataSourceSpout());
        builder.setBolt("SplitBolt", new SplitBolt()).shuffleGrouping("DataSourceSpout");

        // use "|" instead of "," for the field delimiter
        RecordFormat format = new DelimitedRecordFormat()
                .withFieldDelimiter("|");

        // sync the filesystem after every 10 tuples
        SyncPolicy syncPolicy = new CountSyncPolicy(10);

        // rotate files when they reach 5MB: each file on HDFS is filled
        // up to 5MB before writing moves on to a new file
        FileRotationPolicy rotationPolicy = new FileSizeRotationPolicy(5.0f, FileSizeRotationPolicy.Units.MB);

        FileNameFormat fileNameFormat = new DefaultFileNameFormat()
                .withPath("/storm_write_hdfs/");

        HdfsBolt bolt = new HdfsBolt()
                .withFsUrl("hdfs://hadoopcluster") // HDFS HA: put the hdfs-site.xml file under the project's resources directory
                .withFileNameFormat(fileNameFormat)
                .withRecordFormat(format)
                .withRotationPolicy(rotationPolicy)
                .withSyncPolicy(syncPolicy);
        builder.setBolt("HdfsBolt", bolt).shuffleGrouping("SplitBolt");

        // create a local cluster
        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology("LocalWordCountHDFSStormTopology",
                new Config(), builder.createTopology());
    }
}
HDFS here is configured for HA, so the hdfs-site.xml file must be placed under the project's resources directory; otherwise the client fails with an error that it cannot find hadoopcluster (the HA nameservice that serves as the entry point).
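For reference, a minimal sketch of the HA-related entries that hdfs-site.xml needs so the client can resolve the hadoopcluster nameservice. The hostnames nn1host/nn2host and the namenode IDs nn1/nn2 are placeholders; copy the real values from your cluster's configuration.

<configuration>
  <!-- logical name of the HA nameservice; must match the fs URL hdfs://hadoopcluster -->
  <property>
    <name>dfs.nameservices</name>
    <value>hadoopcluster</value>
  </property>
  <!-- the namenodes backing this nameservice -->
  <property>
    <name>dfs.ha.namenodes.hadoopcluster</name>
    <value>nn1,nn2</value>
  </property>
  <property>
    <name>dfs.namenode.rpc-address.hadoopcluster.nn1</name>
    <value>nn1host:8020</value>
  </property>
  <property>
    <name>dfs.namenode.rpc-address.hadoopcluster.nn2</name>
    <value>nn2host:8020</value>
  </property>
  <!-- client-side proxy that fails over between the namenodes -->
  <property>
    <name>dfs.client.failover.proxy.provider.hadoopcluster</name>
    <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
  </property>
</configuration>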
Project directory
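Assuming the standard Maven layout, the project looks roughly like this (the resources location for hdfs-site.xml follows from the HA note above):

storm/
├── pom.xml
└── src
    └── main
        ├── java
        │   └── com/imooc/bigdata/integration/hdfs
        │       └── LocalWordCountHDFSStormTopology.java
        └── resources
            └── hdfs-site.xml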
Run
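Since the topology is submitted to a LocalCluster, it can be launched directly from the IDE, or from the command line with Maven; a minimal sketch, assuming the exec-maven-plugin can be resolved:

mvn compile exec:java -Dexec.mainClass=com.imooc.bigdata.integration.hdfs.LocalWordCountHDFSStormTopology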
Check the results
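After the topology has been running for a while, the rotated files appear under the output path. A quick way to check (the file name in the second command is a placeholder; DefaultFileNameFormat generates timestamped names automatically):

hadoop fs -ls /storm_write_hdfs
hadoop fs -cat /storm_write_hdfs/<generated-file>.txt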