// spark篇3: spark操作ftp  (Spark part 3: reading from FTP with Spark)
// 废话不多说,直接上干货  (No more talk — straight to the working code)
package com.iflytek.ftp
import java.text.SimpleDateFormat
import java.util._
import com.alibaba.fastjson.JSON
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
/**
 * Reads every JSON file under an FTP directory with Spark, extracts the
 * `body.body.id` and `body.body.name` fields from each document, and shows
 * a filtered two-row sample as a DataFrame.
 *
 * Each FTP file is assumed to hold exactly one JSON object with a nested
 * `body.body` structure — TODO confirm against the producing system.
 */
object spark2ftp {
  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession.builder()
      .master("local")
      .appName("appName")
      .config("spark.testing.memory", "471859200")
      .getOrCreate()

    sparkSession.sql("use carbondata")

    // NOTE(review): credentials embedded in plaintext in the URL — move the
    // account/password to configuration or a secret store before production use.
    val dataSource = "ftp://账号:密码@ip:端口/目录/"

    // wholeTextFiles yields (filePath, fileContent) pairs — one record per file,
    // so each record's value is a complete JSON document.
    val ftpInput: RDD[(String, String)] = sparkSession.sparkContext.wholeTextFiles(dataSource)
    val contents: RDD[String] = ftpInput.map(_._2)

    // Pull id/name out of the nested body.body object. getString (rather than
    // get(...).toString) returns null instead of throwing when a key is absent.
    val xq_sb: RDD[(String, String)] = contents.map { json =>
      val root = JSON.parseObject(json)
      val inner = root.getJSONObject("body").getJSONObject("body")
      (inner.getString("id"), inner.getString("name"))
    }

    // Required: brings the implicit Encoder[(String, String)] into scope for
    // createDataset — without this import the call does not compile.
    import sparkSession.implicits._
    val ds: Dataset[(String, String)] = sparkSession.createDataset(xq_sb)
    val frame: DataFrame = ds.toDF("sbmc", "xqbm")

    // The original `.where("1=1 and 2=2")` was a tautological no-op and is removed.
    frame.filter("sbmc like '%名称%'").limit(2).select("sbmc").show()

    sparkSession.stop()
  }
}