// Step 1: a Java program writes a file into the specified directory; package it as a jar and run it on the cluster: java -jar <arg1> <arg2>
package toFlume;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
public class Test {
    public static void main(String[] args) throws IOException, InterruptedException {
        BufferedReader br = null;
        BufferedWriter bw = null;
        try {
            // args[0]: input file name, args[1]: output file name; the base directory here
            // must match the directory that Flume tails in step 2
            br = new BufferedReader(new FileReader(File.separator + "home" + File.separator + "Desktop" + File.separator + "acc" + File.separator + args[0]));
            bw = new BufferedWriter(new FileWriter(File.separator + "home" + File.separator + "Desktop" + File.separator + "acc" + File.separator + args[1]));
            String line;
            // Copy one line every 30 ms to simulate a continuously growing log file
            while ((line = br.readLine()) != null) {
                bw.write(line + "\n");
                bw.flush();
                Thread.sleep(30);
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } finally {
            // Readers have no flush(); only the writer needs to be flushed, and close() does that
            if (bw != null) {
                bw.close();
            }
            if (br != null) {
                br.close();
            }
        }
    }
}
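// A hypothetical invocation (the jar and file names are assumptions; the file written via args[1] should be the one Flume tails in step 2):
// java -jar toFlume.jar source.txt 111.txt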
// Step 2: Flume reads data from the file written by the Java program at the specified path; the sink is configured as a Kafka sink
// Start the Flume agent: bin/flume-ng agent -c conf -f flume-kafka-sparkstreaming.conf -n a1 -Dflume.root.logger=INFO,console
// The Flume configuration is as follows
a1.sources = s1
a1.channels = c1
a1.sinks = k1
a1.sources.s1.type=exec
a1.sources.s1.command=tail -F /home/hadoop/Desktop/acc/111.txt
a1.channels.c1.type=memory
a1.channels.c1.capacity=10000
a1.channels.c1.transactionCapacity=100
# Configure the Kafka sink
a1.sinks.k1.type= org.apache.flume.sink.kafka.KafkaSink
# Kafka broker addresses and ports
a1.sinks.k1.brokerList=master:9092,slave1:9092,slave2:9092
# Kafka topic
a1.sinks.k1.topic=flume-kafka-sparkstreaming
# Serializer
a1.sinks.k1.serializer.class=kafka.serializer.StringEncoder
# Kafka ack setting
a1.sinks.k1.requiredAcks = 1
a1.sources.s1.channels=c1
a1.sinks.k1.channel=c1
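// Note: brokerList, topic and requiredAcks are the legacy Kafka sink property names; on Flume 1.7+ the documented
// equivalents are kafka.bootstrap.servers, kafka.topic and kafka.producer.acks (verify against the Flume version in use), e.g.:
// a1.sinks.k1.kafka.bootstrap.servers=master:9092,slave1:9092,slave2:9092
// a1.sinks.k1.kafka.topic=flume-kafka-sparkstreaming
// a1.sinks.k1.kafka.producer.acks=1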
// Step 3: create the Kafka topic
kafka-topics.sh --create --bootstrap-server master:9092,slave1:9092,slave2:9092 --replication-factor 3 --partitions 3 --topic flume-kafka-sparkstreaming
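// Optional check: verify that events reach the topic with a console consumer (assuming the same broker list)
kafka-console-consumer.sh --bootstrap-server master:9092,slave1:9092,slave2:9092 --topic flume-kafka-sparkstreaming --from-beginning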
/* Step 4: Kafka -> ssc (StreamingContext) */
package sparkStreamingTest
import java.util._
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.streaming.kafka010._
import sparkStreamingTest.CostomTeset.updateFunc
import org.apache.spark.streaming.{Seconds, StreamingContext}
object SparkStreamingKafka {
  def main(args: Array[String]): Unit = {
    // Log level
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf().setMaster("local[2]").setAppName("streaming-kafka")
    // SparkContext
    val sc = new SparkContext(conf)
    // StreamingContext with a 2-second batch interval
    val ssc = new StreamingContext(sc, Seconds(2))
    // Checkpoint directory (required by updateStateByKey)
    ssc.checkpoint("E:/checkPoint1")
    // Kafka consumer parameters
    val kafkaParams = scala.Predef.Map[String, Object](
      "bootstrap.servers" -> "master:9092,slave1:9092,slave2:9092", // broker list
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "use_a_separate_group_id_for_each_stream",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    // Kafka topic(s) to subscribe to
    val topics1 = Array("flume-kafka-sparkstreaming")
    // Create the direct stream from the topics and the Kafka parameters
    val stream = KafkaUtils.createDirectStream(
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](topics1, kafkaParams))
    // The Kafka key is null; split each value on spaces and take the first field, map it to (key, 1),
    // then updateStateByKey accumulates the count per key across batches
    val result = stream.map(x => (x.key(), x.value())).map(_._2.split(" ")(0))
      .map((_, 1)).updateStateByKey(updateFunc, new HashPartitioner(ssc.sparkContext.defaultParallelism), true)
      .transform(x => x.sortBy(_._2, false))
    // Output operation: triggers execution of the DStream
    result.print()
    // Start the computation
    ssc.start()
    // Wait for the computation to terminate
    ssc.awaitTermination()
  }
}
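// The code above imports updateFunc from an object named CostomTeset that is not shown in this listing.
// A minimal sketch of what that helper might look like (the object name is kept only to match the import,
// the body is an assumption; the signature is the one expected by this updateStateByKey overload):
package sparkStreamingTest
object CostomTeset {
  // newValues: the 1s emitted for a key in the current batch;
  // runningCount: the accumulated count from earlier batches (None the first time a key is seen)
  val updateFunc: (Seq[Int], Option[Int]) => Option[Int] =
    (newValues, runningCount) => Some(newValues.sum + runningCount.getOrElse(0))
}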