Spark 读取 CSV 文件（Spark reading a CSV file）
package nj.zb.kb11
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
object ReadCsvDemo {
  /**
   * Demonstrates two ways of reading users.csv:
   *  1. as a raw RDD of comma-split fields with the header row dropped,
   *  2. as a DataFrame via the csv data source, followed by a cast and filter
   *     on the birthyear column.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("sparkreadcsv")
    val sc = SparkContext.getOrCreate(conf)

    // RDD route: drop the header line, then split each record on commas.
    val lines = sc.textFile("E:\\ideaProjects\\sparkstu\\in\\users.csv")
    val lines1: RDD[Array[String]] = lines
      .filter(!_.startsWith("user_id")) // idiomatic negation instead of `== false`
      .map(_.split(","))

    // DataFrame route: header=true lets the csv source use the first row as column names.
    val spark: SparkSession = SparkSession.builder()
      .appName("ReadCsvSparkSession")
      .master("local[*]")
      .getOrCreate()
    val df: DataFrame = spark.read.format("csv").option("header", true).load("in/users.csv")
    df.printSchema()
    df.show(10)

    val df2: DataFrame = df.select("user_id", "birthyear")
    // Cast the string column to Double; unparseable or missing values become null.
    val df3: DataFrame = df2.withColumn("birthyear", df2("birthyear").cast(DoubleType))
    // Column-based predicate: null birthyears evaluate to null and are dropped
    // by filter, whereas the original row.getDouble(1) threw a
    // NullPointerException on any null cell produced by the cast above.
    df3.filter(df3("birthyear").isNotNull && df3("birthyear") <= 1995).show(10)
  }
}
Spark 读取 JSON 文件（Spark reading a JSON file）
package nj.zb.kb11
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, DataFrameReader, SparkSession}
object ReadJsonDemo {
  /**
   * Reads in/user.json with the json data source and prints the inferred schema.
   */
  def main(args: Array[String]): Unit = {
    // App names corrected from the copy-pasted "csv" variants to match this demo.
    val conf = new SparkConf().setMaster("local[*]").setAppName("sparkreadjson")
    val sc = SparkContext.getOrCreate(conf)
    val spark: SparkSession = SparkSession.builder()
      .appName("ReadJsonSparkSession")
      .master("local[*]")
      .getOrCreate()
    // The json source infers the schema by scanning the file. The original code
    // passed option("head", true), which is not a recognized json-source option
    // (likely a typo for the CSV-only "header") and was silently ignored, so it
    // has been removed.
    val frame: DataFrame = spark.read.format("json").load("in/user.json")
    frame.printSchema()
  }
}