Spark 自定义 RDD:通过 j2ssh 远程读取文件

如何自定义RDD

1.引用依赖

<dependencies>
    <!-- Spark core (Scala 2.11 build): https://mvnrepository.com/artifact/org.apache.spark/spark-core -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>

    <!-- Spark SQL (Scala 2.11 build): https://mvnrepository.com/artifact/org.apache.spark/spark-sql -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>

    <!-- J2SSH provider of sshapi; supplies the com.sshtools.j2ssh classes used for SSH/SFTP access -->
    <dependency>
        <groupId>net.sf.sshapi</groupId>
        <artifactId>sshapi-j2ssh</artifactId>
        <version>1.0.0</version>
    </dependency>
</dependencies>

2. 编写代码继承 RDD,重写 compute 和 getPartitions 方法。要理解其原理,需要先了解 RDD 的五大特性(可以在网上搜索,也可以直接阅读 RDD 源码中的注释);具体实现可以参照官方已有的 RDD 实现类(例如 HadoopRDD)来编写。



import java.io.{BufferedReader, InputStreamReader}

import com.sshtools.j2ssh.SshClient
import com.sshtools.j2ssh.authentication.{AuthenticationProtocolState, PasswordAuthenticationClient}
import com.sshtools.j2ssh.sftp.SftpFileInputStream
import com.sshtools.j2ssh.transport.IgnoreHostKeyVerification
import org.apache.spark.{Partition, SparkContext, TaskContext}
import org.apache.spark.rdd.RDD


/**
 * Partition descriptor for [[OnlineRDD]]: one instance per remote file.
 *
 * @param index position of this partition within the RDD's partition array
 * @param path  path of the remote file that this partition reads
 */
case class IntergerPartition3(
  index: Int,
  path: String
) extends Partition

/**
 * RDD that reads remote text files line by line over SFTP using j2ssh.
 *
 * One partition is created per entry of `paths`. Computing a partition opens its own SSH
 * connection, authenticates with username/password, opens the file and yields one element
 * per line.
 *
 * NOTE(review): `NextIterator` is not imported in the visible part of this file; presumably
 * it is resolved from the enclosing package - confirm.
 * NOTE(review): the server host key is accepted without verification
 * (`IgnoreHostKeyVerification`); confirm this is acceptable for the target environment.
 *
 * @param sc       Spark context that owns this RDD
 * @param ip       host name or IP address of the SSH server
 * @param port     SSH port of the server
 * @param username login user
 * @param password login password
 * @param paths    remote file paths; each one becomes a partition
 */
class OnlineRDD(sc: SparkContext, ip: String, port: Int, username: String, password: String, paths: Array[String]) extends RDD[String](sc, Nil) {

  /**
   * Opens the remote file of the given partition and iterates over its lines.
   *
   * @param split   partition to read; must be an [[IntergerPartition3]]
   * @param context context of the running task, used to register resource clean-up
   * @return iterator over the lines of the remote file
   * @throws RuntimeException if password authentication does not complete
   */
  override def compute(split: Partition, context: TaskContext): Iterator[String] = new NextIterator[String] {
    // Register clean-up before anything is opened, so that a failure while connecting,
    // authenticating or opening the file still releases whatever was acquired so far.
    // close() below tolerates resources that were never initialised.
    context.addTaskCompletionListener { _ => closeIfNeeded() }

    private val path = split.asInstanceOf[IntergerPartition3].path

    // Assigned only after connect() succeeded; stays null otherwise, which tells close()
    // that there is no session to disconnect.
    private val sshClient: SshClient = {
      val client = new SshClient
      client.connect(ip, port, new IgnoreHostKeyVerification)
      client
    }

    private val authResult = {
      val pwd = new PasswordAuthenticationClient
      pwd.setUsername(username)
      pwd.setPassword(password)
      sshClient.authenticate(pwd)
    }
    if (authResult != AuthenticationProtocolState.COMPLETE) {
      throw new RuntimeException("连接服务器异常")
    }

    private val sftpClient = sshClient.openSftpChannel()

    // The flag value 1 is kept from the original code; presumably it is the SFTP
    // "open for reading" flag - TODO confirm against the j2ssh constants.
    // No charset is passed to InputStreamReader, so bytes are decoded with the JVM default.
    private val bufferedReader: BufferedReader =
      new BufferedReader(new InputStreamReader(new SftpFileInputStream(sftpClient.openFile(path, 1))))

    /**
     * Reads the next line of the remote file.
     *
     * @return the next line, or `null` after setting `finished` when the end of the file
     *         has been reached
     */
    override protected def getNext(): String = {
      val line = bufferedReader.readLine()
      if (line == null) finished = true
      line
    }

    /**
     * Releases the reader, the SFTP channel and the SSH session, in that order.
     *
     * Each step runs even if the previous one throws, and resources that were never
     * opened (because construction failed part-way) are skipped.
     */
    override protected def close(): Unit = {
      try {
        if (bufferedReader != null) bufferedReader.close()
      } finally {
        try {
          if (sftpClient != null) sftpClient.close()
        } finally {
          if (sshClient != null) sshClient.disconnect()
        }
      }
    }
  }

  /**
   * Builds one partition per configured remote path, indexed in the order of `paths`.
   *
   * @return array of [[IntergerPartition3]], one for each entry of `paths`
   */
  override protected def getPartitions: Array[Partition] =
    Array.tabulate[Partition](paths.length)(i => IntergerPartition3(i, paths(i)))
}

3.调用

通过 new OnlineRDD(sc, ip, port, username, password, paths) 创建之后,就可以像使用其他 RDD 一样使用它了(例如调用 map、filter、count 等算子)。

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值