package com.uplooking.bigdata.storm.local;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.generated.StormTopology;
import org.apache.storm.shade.org.apache.commons.io.FileUtils;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import java.io.File;
import java.util.Collection;
import java.util.List;
import java.util.Map;
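/**
 * Word count: watches a directory for files; whenever a new file appears, the file is read,
 * its contents are parsed, and the total number of word occurrences is counted.
 *
 * <p>Example: E:/test/storm/a.txt ----> the spout has to watch this directory for new files
 * <pre>
 * hello you
 * hello me
 * hello he
 * </pre>
 * The new file is read into memory and its data is sent to the downstream bolts for processing:
 * step 1, split the data that was read into words: "hello you" ---> the two words hello, you;
 * step 2, count the number of word occurrences.
 *
 * <p>Usually one bolt does one job, so there are two bolts here: the first splits the lines into
 * words and the second counts the words.
 */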
public class LocalWordCountTopology {
//做数据源,监听目录,当有新文件产生,读取其中的内容,发送到下游bolt
static class WCSpout extends BaseRichSpout {
private Map conf;
private TopologyContext context;
private SpoutOutputCollector collector;
/**
* 这是一个生命周期方法,一个SumNumSpout实例只运行一次,主要完成初始化的参数设置
* @param conf ---->storm程序以及storm集群相关的配置信息
* @param context ---->整个Topology上下文对象,可以通过该context获得相关topology应用属性
* @param collector ---->主要用于收集数据,并将数据发射到下一个阶段
*/
public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
this.conf = conf;
this.context = context;
this.collector = collector;
}
//监听一个目录新文件的产生
public void nextTuple() {
/**
* File directory ----> 要要监控的目录对象
* String[] extensions ---->要监控的目录下面以什么结尾(说白了就是扩展名)的文件
* 注意,写文件扩展名的时候不能写"."
* boolean recursive ---->是否递归遍历
*/
Collection<File> files = FileUtils.listFiles(new File("E:/test/storm"),
new String[]{"txt", "log", "csv"}, true);
List<String> lines = null;
try {
for (File file : files) {
// BufferedReader br = new BufferedReader(new FileReader(file));
// String line = null;
// while((line = br.readLine()) != null) {
// collector.emit(new Values(line));
// }
lines = FileUtils.readLines(file, "UTF-8");
for (String line : lines) {
System.out.println("spout读取到的内容:" + line);
collector.emit(new Values(line));
}
//读取完成一个文件之后,将其重命名,避免下次再读
FileUtils.moveFile(file, new File(file.getAbsolutePath() + "." + System.currentTimeMillis()));
}
}catch (Exception e) {
// e.printStackTrace();//这里就不用输出异常信息了
}
}
public void declareOutputFields(OutputFieldsDeclarer declarer) {
declarer.declare(new Fields("line"));
}
}
//读取上述spout发送过来的tuple,对tuple中的数据进行单词拆分,将拆分之后的单词发送给下游bolt
static class SplitBolt extends BaseRichBolt {
private Map conf;
private TopologyContext context;
private OutputCollector collector;
public void prepare(Map conf, TopologyContext context, OutputCollector collector) {
this.conf = conf;
this.context = context;
this.collector = collector;
}
public void execute(Tuple tuple) {
String line = tuple.getStringByField("line");
String[] splits = line.split(" ");
for (String word : splits) {
collector.emit(new Values(word, 1));
}
}
public void declareOutputFields(OutputFieldsDeclarer declarer) {
declarer.declare(new Fields("word", "times"));
}
}
//接收上游bolt发送过来的单词,对单词进行统计
static class WordCountBolt extends BaseRichBolt {
private Map conf;
private TopologyContext context;
private OutputCollector collector;
public void prepare(Map conf, TopologyContext context, OutputCollector collector) {
this.conf = conf;
this.context = context;
this.collector = collector;
}
int sum = 0;
public void execute(Tuple tuple) {
String word = tuple.getStringByField("word");
int times = tuple.getIntegerByField("times");
sum += times;
System.out.println("截止到目前为止出现的单词个数:" + sum);
}
public void declareOutputFields(OutputFieldsDeclarer declarer) {
}
}
/**
 * Builds the word-count topology (spout -> split bolt -> count bolt, connected with shuffle
 * grouping) and submits it to an in-process local cluster under the simple name of this class.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // Wire the spout and the two bolts together.
    final TopologyBuilder wiring = new TopologyBuilder();
    wiring.setSpout("wcSpout_id", new WCSpout());
    wiring.setBolt("splitBolt_id", new SplitBolt()).shuffleGrouping("wcSpout_id");
    wiring.setBolt("wcBolt_id", new WordCountBolt()).shuffleGrouping("splitBolt_id");
    final StormTopology wordCount = wiring.createTopology();

    // Run the topology in local mode with a default configuration.
    final LocalCluster cluster = new LocalCluster();
    cluster.submitTopology(LocalWordCountTopology.class.getSimpleName(), new Config(), wordCount);
}
}
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.generated.StormTopology;
import org.apache.storm.shade.org.apache.commons.io.FileUtils;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import java.io.File;
import java.util.Collection;
import java.util.List;
import java.util.Map;
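/**
 * Word count: watches a directory for files; whenever a new file appears, the file is read,
 * its contents are parsed, and the total number of word occurrences is counted.
 *
 * <p>Example: E:/test/storm/a.txt ----> the spout has to watch this directory for new files
 * <pre>
 * hello you
 * hello me
 * hello he
 * </pre>
 * The new file is read into memory and its data is sent to the downstream bolts for processing:
 * step 1, split the data that was read into words: "hello you" ---> the two words hello, you;
 * step 2, count the number of word occurrences.
 *
 * <p>Usually one bolt does one job, so there are two bolts here: the first splits the lines into
 * words and the second counts the words.
 */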
public class LocalWordCountTopology {
//做数据源,监听目录,当有新文件产生,读取其中的内容,发送到下游bolt
static class WCSpout extends BaseRichSpout {
private Map conf;
private TopologyContext context;
private SpoutOutputCollector collector;
/**
* 这是一个生命周期方法,一个SumNumSpout实例只运行一次,主要完成初始化的参数设置
* @param conf ---->storm程序以及storm集群相关的配置信息
* @param context ---->整个Topology上下文对象,可以通过该context获得相关topology应用属性
* @param collector ---->主要用于收集数据,并将数据发射到下一个阶段
*/
public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
this.conf = conf;
this.context = context;
this.collector = collector;
}
//监听一个目录新文件的产生
public void nextTuple() {
/**
* File directory ----> 要要监控的目录对象
* String[] extensions ---->要监控的目录下面以什么结尾(说白了就是扩展名)的文件
* 注意,写文件扩展名的时候不能写"."
* boolean recursive ---->是否递归遍历
*/
Collection<File> files = FileUtils.listFiles(new File("E:/test/storm"),
new String[]{"txt", "log", "csv"}, true);
List<String> lines = null;
try {
for (File file : files) {
// BufferedReader br = new BufferedReader(new FileReader(file));
// String line = null;
// while((line = br.readLine()) != null) {
// collector.emit(new Values(line));
// }
lines = FileUtils.readLines(file, "UTF-8");
for (String line : lines) {
System.out.println("spout读取到的内容:" + line);
collector.emit(new Values(line));
}
//读取完成一个文件之后,将其重命名,避免下次再读
FileUtils.moveFile(file, new File(file.getAbsolutePath() + "." + System.currentTimeMillis()));
}
}catch (Exception e) {
// e.printStackTrace();//这里就不用输出异常信息了
}
}
public void declareOutputFields(OutputFieldsDeclarer declarer) {
declarer.declare(new Fields("line"));
}
}
//读取上述spout发送过来的tuple,对tuple中的数据进行单词拆分,将拆分之后的单词发送给下游bolt
static class SplitBolt extends BaseRichBolt {
private Map conf;
private TopologyContext context;
private OutputCollector collector;
public void prepare(Map conf, TopologyContext context, OutputCollector collector) {
this.conf = conf;
this.context = context;
this.collector = collector;
}
public void execute(Tuple tuple) {
String line = tuple.getStringByField("line");
String[] splits = line.split(" ");
for (String word : splits) {
collector.emit(new Values(word, 1));
}
}
public void declareOutputFields(OutputFieldsDeclarer declarer) {
declarer.declare(new Fields("word", "times"));
}
}
//接收上游bolt发送过来的单词,对单词进行统计
static class WordCountBolt extends BaseRichBolt {
private Map conf;
private TopologyContext context;
private OutputCollector collector;
public void prepare(Map conf, TopologyContext context, OutputCollector collector) {
this.conf = conf;
this.context = context;
this.collector = collector;
}
int sum = 0;
public void execute(Tuple tuple) {
String word = tuple.getStringByField("word");
int times = tuple.getIntegerByField("times");
sum += times;
System.out.println("截止到目前为止出现的单词个数:" + sum);
}
public void declareOutputFields(OutputFieldsDeclarer declarer) {
}
}
/**
 * Builds the word-count topology (spout -> split bolt -> count bolt, connected with shuffle
 * grouping) and submits it to an in-process local cluster under the simple name of this class.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // Wire the spout and the two bolts together.
    final TopologyBuilder wiring = new TopologyBuilder();
    wiring.setSpout("wcSpout_id", new WCSpout());
    wiring.setBolt("splitBolt_id", new SplitBolt()).shuffleGrouping("wcSpout_id");
    wiring.setBolt("wcBolt_id", new WordCountBolt()).shuffleGrouping("splitBolt_id");
    final StormTopology wordCount = wiring.createTopology();

    // Run the topology in local mode with a default configuration.
    final LocalCluster cluster = new LocalCluster();
    cluster.submitTopology(LocalWordCountTopology.class.getSimpleName(), new Config(), wordCount);
}
}