DStream Input
-
Basic Data Sources
-
File Data Source
Created via:
StreamingContext.fileStream[KeyClass, ValueClass, InputFormatClass]
For text files:
streamingContext.textFileStream(dataDirectory)
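Note that a file stream only processes files that appear in the monitored directory after the stream starts, so files should be moved or renamed into it atomically. For reference, textFileStream is a convenience wrapper over the typed fileStream; a minimal sketch of the equivalent typed call (assuming a StreamingContext named ssc and the same HDFS path as the example below):
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
// Keys are byte offsets within each file, values are the lines themselves
val lines = ssc.fileStream[LongWritable, Text, TextInputFormat]("hdfs://hadoop129:9000/data")
  .map(_._2.toString)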
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.fengling</groupId>
    <artifactId>streaming</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <spark.version>2.1.1</spark.version>
        <scala.version>2.11.12</scala.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
            <scope>provided</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- Java compiler settings: compile everything at the Java 1.8 level -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.6.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <!-- Scala compilation support -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <!-- Build an uber jar with all non-provided dependencies at package time -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.0.0</version>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                        <configuration>
                            <archive>
                                <manifest>
                                    <mainClass>com.fengling.TextFileWordCount</mainClass>
                                </manifest>
                            </archive>
                            <descriptorRefs>
                                <descriptorRef>jar-with-dependencies</descriptorRef>
                            </descriptorRefs>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
File word count example: reads the /data directory on HDFS
package com.fengling

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object TextFileWordCount extends App {
  // Create the StreamingContext with a 1-second batch interval
  val conf = new SparkConf().setMaster("local[4]").setAppName("TextFileWordCountV0.1")
  val ssc = new StreamingContext(conf, Seconds(1))

  // Monitor the /data directory on HDFS for newly arriving text files
  val lines = ssc.textFileStream("hdfs://hadoop129:9000/data")
  val words = lines.flatMap(_.split(" "))
  val wordCount = words.map(word => (word, 1)).reduceByKey(_ + _)
  wordCount.print()

  ssc.start()
  ssc.awaitTermination()
}
Package the jar:
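Assuming the pom.xml above, a plain Maven build is enough; the assembly plugin is bound to the package phase, so target/streaming-1.0-SNAPSHOT-jar-with-dependencies.jar is produced automatically:
mvn clean package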
Upload it to the VM and test with spark-submit:
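The exact command depends on where Spark is installed and where the jar was uploaded; an invocation in the same style as the CustomReceiver test at the end of this section would be:
bin/spark-submit --class com.fengling.TextFileWordCount ~/streaming-1.0-SNAPSHOT-jar-with-dependencies.jar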
Upload a file to the /data directory:
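For example, with a hypothetical local file wordcount.txt (the file must land in /data after the streaming job has started, or textFileStream will not pick it up):
hdfs dfs -put wordcount.txt /data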
-
Custom Data Sources
A custom data source is defined by extending the abstract class Receiver and implementing its onStart() and onStop() methods. onStart() must start the thread that receives data and return immediately; each received record is handed to Spark via store().
package com.fengling

import java.io.{BufferedReader, InputStreamReader}
import java.net.Socket
import java.nio.charset.StandardCharsets

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.receiver.Receiver

/**
 * @author fengchengliang@126.com
 * @date 2019-10-29
 */
class CustomReceive(host: String, port: Int) extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) {

  override def onStart(): Unit = {
    // Receive on a separate thread so onStart() returns immediately
    new Thread("Socket Receiver") {
      override def run(): Unit = {
        receive()
      }
    }.start()
  }

  /** Create a socket connection and receive data until the receiver is stopped */
  private def receive(): Unit = {
    var socket: Socket = null
    var userInput: String = null
    try {
      // Connect to host:port
      socket = new Socket(host, port)
      // Keep reading until stopped or the connection is broken
      val reader = new BufferedReader(
        new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8))
      userInput = reader.readLine()
      while (!isStopped && userInput != null) {
        store(userInput)
        userInput = reader.readLine()
      }
      reader.close()
      socket.close()
      // Restart in an attempt to reconnect when the server is active again
      restart("Trying to connect again")
    } catch {
      case e: java.net.ConnectException =>
        // Restart if we could not connect to the server
        restart("Error connecting to " + host + ":" + port, e)
      case t: Throwable =>
        // Restart on any other error
        restart("Error receiving data", t)
    }
  }

  override def onStop(): Unit = {}
}

object CustomReceiver {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("CustomReceiver")
    val ssc = new StreamingContext(sparkConf, Seconds(1))

    // Plug the custom receiver into the streaming context
    val customReceiverStream = ssc.receiverStream(new CustomReceive("hadoop129", 9999))
    val words = customReceiverStream.flatMap(_.split(" "))
    // Map each word to a (word, 1) pair and sum the counts per word
    val wordCount = words.map(word => (word, 1)).reduceByKey(_ + _)
    wordCount.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
Test:
[fengling@hadoop129 spark-2.4.4-bin-hadoop2.7]$ bin/spark-submit --class com.fengling.CustomReceiver ~/streaming-1.0-SNAPSHOT-jar-with-dependencies.jar
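The receiver connects to hadoop129:9999, so something must already be listening on that port before the job is submitted; for a quick test, netcat works (lines typed into it show up in the printed word counts):
[fengling@hadoop129 ~]$ nc -lk 9999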