Introduction
The code below has been tested in a local test cluster, but has not yet been submitted to the Docker cluster for a real run; this post will be updated once the final project is complete.
Prerequisites
A Storm cluster
A Kafka cluster
A ZooKeeper cluster
For how to set up these three clusters, see my other posts.
Requirements
The assignment requires using a Storm cluster to compute statistics over 10 stock data files. The dataset covers 10 stocks and includes fields such as the trade amount, trade price, trade volume, and trade time.
Two tasks are implemented:
1. Real-time statistics of the total trade amount and trade volume per stock
2. Real-time statistics of the total trade amount and trade volume per stock, per hour
Approach
A Kafka producer reads the files and sends the data as messages to a spout; the spout passes each message to a parent bolt, which preprocesses the data and dispatches it to two child bolts that implement the two concrete tasks.
Code
Dependencies (pom.xml)
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.example</groupId>
        <artifactId>kafkaConsume</artifactId>
        <version>1.0-SNAPSHOT</version>
    </parent>
    <artifactId>storm</artifactId>
    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>
    <dependencies>
        <!-- Lombok -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.12</version>
        </dependency>
        <!-- JUnit -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.13</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>2.4.1</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.storm/storm-core -->
        <dependency>
            <groupId>org.apache.storm</groupId>
            <artifactId>storm-core</artifactId>
            <version>2.2.0</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.storm/storm-kafka-client -->
        <dependency>
            <groupId>org.apache.storm</groupId>
            <artifactId>storm-kafka-client</artifactId>
            <version>2.2.0</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.47</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-csv -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-csv</artifactId>
            <version>1.8</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>2.5.5</version>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>com.yj.TCPClient.upload.App</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <!-- With this executions block, a plain mvn package or mvn install builds the fat jar -->
                <!-- Without it, you must run mvn package assembly:single explicitly -->
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
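After mvn package, the assembly plugin produces a jar-with-dependencies under target/. The mainClass above appears to be left over from another module; it is harmless here, since storm jar names the main class explicitly, e.g. (jar name assumed from the artifactId and version):

storm jar target/storm-1.0-SNAPSHOT-jar-with-dependencies.jar org.example.storm.Topology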
Kafka producer code:
package org.example.kafkaPC;

import org.apache.commons.csv.CSVRecord;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.StringSerializer;
import org.example.utils.ReadFile;
import org.example.utils.TimeFormatter;

import java.util.*;
import java.util.stream.IntStream;

public class Producer {
    // Reads the CSV files, computes the trade amount of each record,
    // and publishes one Kafka message per record, keyed by stock code.
    public void producer() {
        // The ten input files
        List<String> filePaths = new ArrayList<>();
        for (int i = 1; i <= 10; i++) {
            filePaths.add("/home/veteran/data/fileData/股票数据/股票数据" + i + ".csv");
        }
        // Target topic
        String topic = "stock_1";
        // Create the producer
        Properties properties = new Properties();
        properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.43.219:9092,192.168.43.219:9093,192.168.43.219:9094");
        properties.put(ProducerConfig.LINGER_MS_CONFIG, 5);
        properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());
        properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());
        KafkaProducer<String, String> kProducer = new KafkaProducer<>(properties);
        // File-reading and time-conversion helpers
        ReadFile readFile = new ReadFile();
        TimeFormatter timeFormatter = new TimeFormatter();
        // Process each of the ten files in turn
        IntStream.range(0, filePaths.size()).forEach(index -> {
            Iterable<CSVRecord> records = readFile.readCSV(filePaths.get(index));
            // Send each record to the topic
            for (CSVRecord csvRecord : records) {
                try {
                    // Use stock_code as the message key
                    String key = csvRecord.get("stock_code");
                    // Trade volume of this record
                    double trade_volume = Double.parseDouble(csvRecord.get("trade_volume"));
                    // Unit price of this record
                    float price = Float.parseFloat(csvRecord.get("price"));
                    String totalPrice = Double.toString(trade_volume * price);
                    // Convert the trade time to a numeric timestamp
                    String time = csvRecord.get("time");
                    long convertedTime = timeFormatter.convertTime(time);
                    System.out.println(key + "," + totalPrice + "," + convertedTime);
                    ProducerRecord<String, String> kRecord = new ProducerRecord<>(topic, key, totalPrice + "," + convertedTime);
                    // Send asynchronously
                    kProducer.send(kRecord);
                } catch (Exception e) {
                    // Skip malformed records but keep going
                    e.printStackTrace();
                }
            }
        });
        // Close the producer (flushes any pending messages)
        kProducer.close();
    }

    public static void main(String[] args) {
        Producer producer = new Producer();
        producer.producer();
    }
}
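The ReadFile and TimeFormatter helpers from org.example.utils are not shown in this post; minimal sketches consistent with how they are called above could look like the following (the yyyy-MM-dd HH:mm:ss time pattern is an assumption about the dataset):

// ReadFile.java (sketch)
package org.example.utils;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;

import java.io.FileReader;
import java.io.IOException;

// Reads a CSV file, treating the first row as the header so that
// columns can be fetched by name (stock_code, trade_volume, price, time).
public class ReadFile {
    public Iterable<CSVRecord> readCSV(String filePath) {
        try {
            return CSVFormat.DEFAULT.withFirstRecordAsHeader().parse(new FileReader(filePath));
        } catch (IOException e) {
            throw new RuntimeException("failed to open " + filePath, e);
        }
    }
}

// TimeFormatter.java (sketch)
package org.example.utils;

import java.text.ParseException;
import java.text.SimpleDateFormat;

// Converts a trade-time string to epoch milliseconds.
// The pattern below is assumed; adjust it to the dataset's actual format.
public class TimeFormatter {
    public long convertTime(String time) throws ParseException {
        SimpleDateFormat fmt = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        return fmt.parse(time).getTime();
    }
}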
Topology code:
package org.example.storm;

import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.kafka.spout.KafkaSpout;
import org.apache.storm.kafka.spout.KafkaSpoutConfig;
import org.apache.storm.topology.TopologyBuilder;

public class Topology {
    public static void main(String[] args) throws Exception {
        // Configure the spout
        KafkaSpoutConfig<String, String> kafkaSpoutConfig = KafkaSpoutConfig
                .builder("192.168.43.219:9092,192.168.43.219:9093,192.168.43.219:9094", "stock_1")
                .setProp(ConsumerConfig.GROUP_ID_CONFIG, "kafkaSpoutTestGroup")
                .build();
        // Build the topology
        TopologyBuilder topologyBuilder = new TopologyBuilder();
        // The spout
        topologyBuilder.setSpout("kafka-spout", new KafkaSpout<>(kafkaSpoutConfig), 1);
        // Parent bolt: preprocessing
        topologyBuilder.setBolt("process-bolt", new ProcessingBolt()).shuffleGrouping("kafka-spout");
        // Child bolt 1: per-stock totals
        // topologyBuilder.setBolt("next-bolt1", new NextBolt1(), 3).shuffleGrouping("process-bolt");
        // Child bolt 2: hourly statistics
        topologyBuilder.setBolt("next-bolt2", new NextBolt2(), 1).shuffleGrouping("process-bolt");
        // Topology configuration
        Config config = new Config();
        // Submit to a real cluster:
        // StormSubmitter.submitTopology("stockStatistic", config, topologyBuilder.createTopology());
        // Create a local cluster for testing
        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology("LocalReadingFromKafkaApp", config, topologyBuilder.createTopology());
    }
}
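To switch between local testing and a real cluster without editing code, a common pattern (a sketch, not part of the original submission) is to branch on the command-line arguments:

// Sketch: submit locally when no topology name is given, otherwise submit
// to the real cluster (requires org.apache.storm.StormSubmitter).
if (args.length == 0) {
    LocalCluster cluster = new LocalCluster();
    cluster.submitTopology("LocalReadingFromKafkaApp", config, topologyBuilder.createTopology());
} else {
    StormSubmitter.submitTopology(args[0], config, topologyBuilder.createTopology());
}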
ProcessingBolt code:
package org.example.storm;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;

import java.util.HashMap;
import java.util.Map;

public class ProcessingBolt extends BaseRichBolt {
    private OutputCollector collector;
    private Map<String, Integer> tradeVolumeMap;
    private Map<String, Double> totalPriceMap;
    private Map<String, Long> startTimeMap;
    private Map<String, Long> accumulateTimeMap;

    @Override
    public void prepare(Map<String, Object> map, TopologyContext topologyContext, OutputCollector outputCollector) {
        this.collector = outputCollector;
        // Trade count per stock
        tradeVolumeMap = new HashMap<>();
        // Total trade amount per stock
        totalPriceMap = new HashMap<>();
        // Earliest trade time seen per stock
        startTimeMap = new HashMap<>();
        // Latest trade time seen per stock
        accumulateTimeMap = new HashMap<>();
    }

    @Override
    public void execute(Tuple tuple) {
        // The message key identifies the stock
        String key = tuple.getStringByField("key");
        // The message value has the form "tradeAmount,time"
        String value = tuple.getStringByField("value");
        String[] valueList = value.split(",");
        Double totalPrice = Double.parseDouble(valueList[0]);
        long time = Long.parseLong(valueList[1]);
        if (tradeVolumeMap.containsKey(key)) {
            // Accumulate the trade count
            tradeVolumeMap.put(key, tradeVolumeMap.get(key) + 1);
            // Accumulate the trade amount
            totalPriceMap.put(key, totalPriceMap.get(key) + totalPrice);
            // This trade is newer than both recorded times: advance the latest time
            if (time > startTimeMap.get(key) && time > accumulateTimeMap.get(key)) {
                accumulateTimeMap.put(key, time);
            } else if (time < startTimeMap.get(key)) {
                // This trade is earlier than the recorded start: update startTimeMap
                startTimeMap.put(key, time);
            }
        } else {
            tradeVolumeMap.put(key, 1);
            totalPriceMap.put(key, totalPrice);
            startTimeMap.put(key, time);
            accumulateTimeMap.put(key, time);
        }
        // Wrap the maps and pass them to the next bolt
        Values values = new Values(tradeVolumeMap, totalPriceMap, startTimeMap, accumulateTimeMap);
        collector.emit(values);
        collector.ack(tuple);
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        // Declare the fields of the default stream
        outputFieldsDeclarer.declare(new Fields("tradeVolumeMap", "totalPriceMap", "startTimeMap", "accumulateTimeMap"));
    }
}
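One caveat: this bolt emits its live HashMaps. In local mode, tuples between components in the same worker are passed by reference, so a downstream bolt can observe a map while this bolt is still mutating it (the downstream bolts below copy on receipt for exactly this reason). A safer variant, as a sketch, is to emit snapshot copies:

// Sketch: emit snapshot copies so downstream bolts never share
// a map that this bolt keeps mutating.
Values values = new Values(
        new HashMap<>(tradeVolumeMap),
        new HashMap<>(totalPriceMap),
        new HashMap<>(startTimeMap),
        new HashMap<>(accumulateTimeMap));
collector.emit(values);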
NextBolt1 code:
package org.example.storm;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Tuple;

import java.util.HashMap;
import java.util.Map;

public class NextBolt1 extends BaseRichBolt {
    private OutputCollector collector;
    private long startTime;

    @Override
    public void prepare(Map<String, Object> map, TopologyContext topologyContext, OutputCollector outputCollector) {
        this.collector = outputCollector;
        this.startTime = System.currentTimeMillis();
    }

    @Override
    public void execute(Tuple tuple) {
        // Copy the incoming maps so we never read state the upstream bolt is still mutating
        Map<String, Integer> tradeVolumeMap = new HashMap<>((Map<String, Integer>) tuple.getValueByField("tradeVolumeMap"));
        Map<String, Double> totalPriceMap = new HashMap<>((Map<String, Double>) tuple.getValueByField("totalPriceMap"));
        Map<String, Long> startTimeMap = new HashMap<>((Map<String, Long>) tuple.getValueByField("startTimeMap"));
        Map<String, Long> accumulateTimeMap = new HashMap<>((Map<String, Long>) tuple.getValueByField("accumulateTimeMap"));
        for (Map.Entry<String, Integer> entry : tradeVolumeMap.entrySet()) {
            // Per-stock totals; the printf is commented out to limit console load (see "Problems" below)
            String stockName = entry.getKey();
            Integer totalVolume = entry.getValue();
            Double totalPrice = totalPriceMap.get(entry.getKey());
            // System.out.printf("[%s] total amount %f, %d trades\n", stockName, totalPrice, totalVolume);
        }
        System.out.printf("Elapsed: %ds\n", (System.currentTimeMillis() - this.startTime) / 1000);
        collector.ack(tuple);
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        // Terminal bolt: no output fields
    }
}
NextBolt2 code:
package org.example.storm;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Tuple;

import java.util.HashMap;
import java.util.Map;

public class NextBolt2 extends BaseRichBolt {
    private Map<String, Integer> tradeVolumeMap;
    private Map<String, Double> totalPriceMap;
    private Map<String, Long> startTimeMap;
    private Map<String, Long> accumulateTimeMap;
    private OutputCollector collector;
    // State local to this bolt: the window start time, trade count, and total
    // amount recorded at the beginning of the current one-hour window, per stock
    private Map<String, Long> recordStartTimeMap;
    private Map<String, Integer> recordTradeVolumeMap;
    private Map<String, Double> recordTotalPriceMap;
    private long startTime;

    @Override
    public void prepare(Map<String, Object> map, TopologyContext topologyContext, OutputCollector outputCollector) {
        this.collector = outputCollector;
        this.startTime = System.currentTimeMillis();
        // Incoming per-stock state
        tradeVolumeMap = new HashMap<>();
        totalPriceMap = new HashMap<>();
        startTimeMap = new HashMap<>();
        accumulateTimeMap = new HashMap<>();
        // This bolt's window baselines
        recordStartTimeMap = new HashMap<>();
        recordTradeVolumeMap = new HashMap<>();
        recordTotalPriceMap = new HashMap<>();
    }

    @Override
    public void execute(Tuple tuple) {
        // Receive the accumulated state from the upstream bolt
        tradeVolumeMap = (Map<String, Integer>) tuple.getValueByField("tradeVolumeMap");
        totalPriceMap = (Map<String, Double>) tuple.getValueByField("totalPriceMap");
        startTimeMap = (Map<String, Long>) tuple.getValueByField("startTimeMap");
        accumulateTimeMap = (Map<String, Long>) tuple.getValueByField("accumulateTimeMap");
        // Defensive copies, since the upstream bolt keeps mutating these maps
        Map<String, Integer> tradeVolumeMapCopy = new HashMap<>(tradeVolumeMap);
        Map<String, Double> totalPriceMapCopy = new HashMap<>(totalPriceMap);
        Map<String, Long> startTimeMapCopy = new HashMap<>(startTimeMap);
        Map<String, Long> accumulateTimeMapCopy = new HashMap<>(accumulateTimeMap);
        for (Map.Entry<String, Integer> entry : tradeVolumeMapCopy.entrySet()) {
            String key = entry.getKey();
            if (recordTradeVolumeMap.containsKey(key)) {
                long accumulateTime = accumulateTimeMapCopy.get(key);
                long recordStartTime = recordStartTimeMap.get(key);
                long volume = tradeVolumeMapCopy.get(key) - recordTradeVolumeMap.get(key);
                double price = totalPriceMapCopy.get(key) - recordTotalPriceMap.get(key);
                // An hour of event time has passed: report the window and roll it over
                if (accumulateTime - recordStartTime >= 3600000L) {
                    // System.out.printf("[%s] %d trades in the last hour, amount %f\n", key, volume, price);
                    // Reset the window start time
                    recordStartTimeMap.put(key, accumulateTime);
                    // Reset the trade count baseline
                    recordTradeVolumeMap.put(key, tradeVolumeMapCopy.get(key));
                    // Reset the amount baseline
                    recordTotalPriceMap.put(key, totalPriceMapCopy.get(key));
                }
            } else {
                // First time we see this stock: initialize its window state
                recordStartTimeMap.put(key, startTimeMapCopy.get(key));
                recordTradeVolumeMap.put(key, tradeVolumeMapCopy.get(key));
                recordTotalPriceMap.put(key, totalPriceMapCopy.get(key));
            }
        }
        System.out.printf("Elapsed: %ds\n", (System.currentTimeMillis() - this.startTime) / 1000);
        collector.ack(tuple);
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        // Terminal bolt: no output fields
    }
}
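Note that the hourly window in NextBolt2 is driven by event time, not wall-clock time: a stock's window closes once the latest trade time received for it (accumulateTimeMap) is at least 3,600,000 ms, i.e. one hour, past the recorded window start, at which point the counts and amounts accumulated since the last baseline are that hour's figures.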
Problems and solutions
1. Coordinating the two bolts
The two concrete tasks are handled by two separate bolts. When both run at once we sometimes need to inspect their results, but printing to the console costs far more time than the computation itself. During testing, if the two bolts print different volumes of output they consume different amounts of resources, and when the gap grows too large it triggers threading errors, so the resource allocation between the two bolts has to be managed; one way to throttle the printing is sketched below.
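A standard way to decouple printing from per-tuple work, as a sketch rather than the fix used in this assignment, is Storm's tick tuples: the bolt asks Storm to deliver a system tuple at a fixed interval and only prints when one arrives. (Requires org.apache.storm.Config and org.apache.storm.Constants.)

// Sketch: ask Storm to deliver a tick tuple to this bolt every 10 seconds.
@Override
public Map<String, Object> getComponentConfiguration() {
    Map<String, Object> conf = new HashMap<>();
    conf.put(Config.TOPOLOGY_TICK_TUPLE_FREQ_SECS, 10);
    return conf;
}

// In execute(), print only when the tuple is a tick tuple,
// and just accumulate state otherwise.
private static boolean isTickTuple(Tuple tuple) {
    return Constants.SYSTEM_COMPONENT_ID.equals(tuple.getSourceComponent())
            && Constants.SYSTEM_TICK_STREAM_ID.equals(tuple.getSourceStreamId());
}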
2. Adapting the local-cluster code to a real cluster
The code above was tested on a local cluster. After packaging and submitting it to the real cluster, the spout received no messages from Kafka. This problem is still unsolved and this post will be updated once it is fixed during the final project.
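Two general things worth checking for this symptom (generic Kafka/Storm advice, not a confirmed diagnosis): first, Kafka's advertised.listeners must resolve and be reachable from the Storm worker nodes, not just from the machine that ran the producer; second, the consumer group kafkaSpoutTestGroup may already have committed offsets past the data, so it can help to set the spout's first-poll offset strategy explicitly. A sketch of the latter (in storm-kafka-client 2.x, FirstPollOffsetStrategy lives in org.apache.storm.kafka.spout):

// Sketch: start from the earliest offset when the group has no committed
// offset, so pre-existing messages in the topic are not skipped.
KafkaSpoutConfig<String, String> kafkaSpoutConfig = KafkaSpoutConfig
        .builder("192.168.43.219:9092,192.168.43.219:9093,192.168.43.219:9094", "stock_1")
        .setProp(ConsumerConfig.GROUP_ID_CONFIG, "kafkaSpoutTestGroup")
        .setFirstPollOffsetStrategy(FirstPollOffsetStrategy.EARLIEST)
        .build();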