1.JavaFlumeEventCount
package cn.gitv.bi.log.analysis.spark;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.*;
import org.apache.spark.streaming.flume.FlumeUtils;
import org.apache.spark.streaming.flume.SparkFlumeEvent;
/**
 * Spark Streaming job that counts Flume events per micro-batch and prints a
 * one-line summary for each batch.
 *
 * <p>Usage: {@code JavaFlumeEventCount <host> <port>} — the host/port this
 * driver binds an Avro receiver to (push-based Flume integration).
 *
 * @author puppy
 * @date 2015-04-12
 */
public final class JavaFlumeEventCount {
/** Utility entry-point class; never instantiated. */
private JavaFlumeEventCount() {
}
// "serial" suppressed for the anonymous Function inner class below.
@SuppressWarnings("serial")
public static void main(String[] args) {
if (args.length != 2) {
System.err.println("Usage: JavaFlumeEventCount <host> <port>");
System.exit(1);
}
String host = args[0];
int port = Integer.parseInt(args[1]);
// One micro-batch (one RDD) every 2000 ms.
Duration batchInterval = new Duration(2000);
SparkConf sparkConf = new SparkConf().setAppName("JavaFlumeEventCount");
JavaStreamingContext ssc = new JavaStreamingContext(sparkConf,
batchInterval);
// Push model: Spark opens an Avro server on host:port and Flume's avro
// sink delivers events to it.
JavaReceiverInputDStream<SparkFlumeEvent> flumeStream = FlumeUtils
.createStream(ssc, host, port);
// Count events in each batch and print "Received N flume events.".
// NOTE: a stray bare `flumeStream.count();` call was removed here — it
// had no output operation attached, so it never executed anything.
flumeStream.count().map(new Function<Long, String>() {
@Override
public String call(Long in) {
return "Received " + in + " flume events.";
}
}).print();
ssc.start();
// Block the driver until the streaming context is stopped or fails.
ssc.awaitTermination();
}
}
2.将上面代码打成jar。
bi-data-analysis-spark.jar(名字自己随意定)
3.启动
启动时需注意,如果采用 --master spark://slave3.hadoop.gitv.we:7077 即在集群运行,你绑定的主机名必须写成 0.0.0.0,端口号自己定义,等 spark 程序启动后,查看在哪个节点上启动了接收 flume 流数据的端口(查看端口命令 netstat -anp | grep 12345),再将该主机和端口配置到 flume 的配置文件中,启动 flume,就可以实现数据的传输了。如果采用 yarn-cluster 也是一样。如果本地启动就没有此限制了
另外注意依赖的 jar 包用 --jars 标识,多个 jar 之间用英文逗号 , 分割
# Submit the event-count job to the standalone cluster.
# Note: every continued line needs a trailing backslash, and memory sizes
# need a unit suffix (the original "--driver-memory 1" / "defalut" were bugs).
spark-submit --class cn.gitv.bi.log.analysis.spark.JavaFlumeEventCount \
  --master spark://slave3.hadoop.gitv.we:7077 \
  --num-executors 3 \
  --driver-memory 1g \
  --executor-memory 1g \
  --executor-cores 1 \
  --queue default \
  --jars ./spark-streaming_2.10-1.2.0.jar,spark-streaming-flume_2.10-1.2.0.jar,flume-ng-sdk-1.4.0.jar,avro-ipc-1.7.4.jar \
  ./bi-data-analysis-spark.jar 0.0.0.0 12345
4.启动flume
注意要先找到spark哪个worker在监听avro的source,我这里是10.10.121.56 (查看命令 netstat -anp | grep 12345)
启动命令
./bin/flume-ng agent -n agent1 -c ./conf -f ./conf/vod-access-avro-collect -Dflume.root.logger=INFO,console
配置文件 vod-access-avro-collect 内容
# Flume agent: tail the vod access log and forward events over Avro RPC
# to the Spark Streaming receiver started by JavaFlumeEventCount.
agent1.sources = source1
agent1.sinks = sink1
agent1.channels = channel1

# Source: follow the access log with tail -F
agent1.sources.source1.type = exec
agent1.sources.source1.command = tail -F /data/pingback/logs/vod_pb-access.log
#agent1.sources.source1.command = tail -n +0 -F /home/panchao/testFlume/test.log
agent1.sources.source1.channels = channel1

# Interceptor: stamp each event header with the originating host
agent1.sources.source1.interceptors = i1
agent1.sources.source1.interceptors.i1.type = host
agent1.sources.source1.interceptors.i1.hostHeader = hostname

# Sink: Avro RPC to the Spark worker node/port that hosts the Flume receiver
# (find it with: netstat -anp | grep 12345)
agent1.sinks.sink1.type = avro
agent1.sinks.sink1.hostname = 10.10.121.56
agent1.sinks.sink1.port = 12345
agent1.sinks.sink1.batch-size = 10000

# Channel: durable file channel with dual checkpoints
agent1.channels.channel1.type = file
agent1.channels.channel1.checkpointDir = /data1/bi/flume/vod_checkpoint
agent1.channels.channel1.useDualCheckpoints = true
agent1.channels.channel1.backupCheckpointDir = /data1/bi/flume/vod_backupcheckpoint
agent1.channels.channel1.dataDirs = /data1/bi/flume/vod_data

# Wiring: source1 -> channel1 -> sink1
# (a duplicate agent1.sources.source1.channels line was removed; it was
# already set in the source section above)
agent1.sinks.sink1.channel = channel1