如何自定义RDD
1.引用依赖
<dependencies>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>net.sf.sshapi</groupId>
<artifactId>sshapi-j2ssh</artifactId>
<version>1.0.0</version>
</dependency>
</dependencies>
2.编写代码继承RDD,重写compute和getPartitions方法。如果要理解其原理,就必须了解RDD的五大特性(可以搜索到,也可以在RDD的源码注释中找到),然后参照官方的RDD实现类模仿实现。注意:代码中用到的NextIterator位于org.apache.spark.util包且是private[spark]的,因此这份代码需要放在org.apache.spark的子包下(或自行复制一份NextIterator)才能编译通过。
import java.io.{BufferedReader, InputStreamReader}

import com.sshtools.j2ssh.SshClient
import com.sshtools.j2ssh.authentication.{AuthenticationProtocolState, PasswordAuthenticationClient}
import com.sshtools.j2ssh.sftp.SftpFileInputStream
import com.sshtools.j2ssh.transport.IgnoreHostKeyVerification

import org.apache.spark.rdd.RDD
// NOTE(review): NextIterator is private[spark]; this file must live under an
// org.apache.spark sub-package (or ship a copy of the class) for this to resolve.
import org.apache.spark.util.NextIterator
import org.apache.spark.{Partition, SparkContext, TaskContext}
// Partition that carries the remote file path it covers; `index` satisfies
// Spark's Partition contract (one partition per SFTP path, see getPartitions).
// NOTE(review): name misspells "Integer" — kept as-is because OnlineRDD.compute
// casts to this exact identifier; renaming would break callers.
case class IntergerPartition3(index: Int,path:String) extends Partition
/**
 * An RDD that streams text lines from remote files over SFTP (j2ssh),
 * producing one partition per path in `paths`.
 *
 * Each task opens its own SSH connection in `compute`, authenticates with
 * username/password, opens the partition's file read-only, and yields lines
 * lazily through a [[NextIterator]]. Resources are released via the task
 * completion listener / `close()`.
 *
 * @param sc       the SparkContext (no parent dependencies: `Nil`)
 * @param ip       SFTP server host
 * @param port     SFTP server port
 * @param username login name
 * @param password login password
 * @param paths    remote file paths; each becomes one partition
 */
class OnlineRDD(sc: SparkContext, ip: String, port: Int, username: String, password: String, paths: Array[String]) extends RDD[String](sc, Nil) {

  // Spelling fixed from DEFULT_SPLIT_SIZE. 200 MB; currently unused —
  // kept for a future feature splitting one large file into several partitions.
  private final val DEFAULT_SPLIT_SIZE = 209715200

  override def compute(split: Partition, context: TaskContext): Iterator[String] = new NextIterator[String] {
    private val path = split.asInstanceOf[IntergerPartition3].path

    // Ensure the connection is torn down even if the task ends without
    // fully draining the iterator (closeIfNeeded guards double-close).
    context.addTaskCompletionListener { _ => closeIfNeeded() }

    private val sshClient = new SshClient
    sshClient.connect(ip, port, new IgnoreHostKeyVerification)

    private val pwd = new PasswordAuthenticationClient
    pwd.setUsername(username)
    pwd.setPassword(password)

    if (sshClient.authenticate(pwd) != AuthenticationProtocolState.COMPLETE) {
      // Fix: the TCP connection is already established at this point; the
      // original leaked it on authentication failure. Disconnect before failing.
      sshClient.disconnect()
      throw new RuntimeException("连接服务器异常")
    }

    private val sftpClient = sshClient.openSftpChannel()
    // 1 == read-only open mode in j2ssh (SftpSubsystemClient.OPEN_READ) —
    // TODO confirm against the sshapi-j2ssh version actually on the classpath.
    private val sftpFile = sftpClient.openFile(path, 1)
    // val instead of the original null-initialized var: assigned exactly once.
    private val bufferedReader =
      new BufferedReader(new InputStreamReader(new SftpFileInputStream(sftpFile)))

    /**
     * Returns the next line, or sets `finished` and returns a placeholder
     * (ignored by NextIterator) when the stream is exhausted.
     */
    override protected def getNext(): String = {
      val line = bufferedReader.readLine()
      if (line == null) {
        finished = true
        null.asInstanceOf[String]
      } else {
        line
      }
    }

    /** Releases reader, SFTP channel and SSH connection (innermost first). */
    override protected def close(): Unit = {
      bufferedReader.close()
      sftpClient.close()
      sshClient.disconnect()
    }
  }

  /** One partition per remote path, indexed by position in `paths`. */
  override protected def getPartitions: Array[Partition] =
    paths.zipWithIndex.map { case (p, i) => IntergerPartition3(i, p): Partition }
}
3.调用
通过new OnlineRDD创建就可以和之前的一样使用了