加载操作文件
CSV格式
/**
 * Demo entry point: loads a CSV file into a DataFrame, prints its schema,
 * and displays two of its columns.
 */
object SelectCSV {
  /**
   * Reads `in/users.csv` with a local Spark session, prints the schema, and
   * shows the `user_id` and `locale` columns. The session is always stopped
   * before returning, even if loading or displaying the data fails.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Obtain a SparkSession instance running in local mode.
    val conf = new SparkConf().setMaster("local").setAppName("select")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    try {
      // Load the CSV file; the "header" option is set so the first row supplies column names.
      val df: DataFrame = spark.read.format("csv").option("header", "true").load("in/users.csv")
      // Print the structure (schema) of the loaded data.
      df.printSchema()
      /*
      Sample output:
      root
       |-- user_id: string (nullable = true)
       |-- locale: string (nullable = true)
       |-- birthyear: string (nullable = true)
       |-- gender: string (nullable = true)
       |-- joinedAt: string (nullable = true)
       |-- location: string (nullable = true)
       |-- timezone: string (nullable = true)
      */
      // Show only the user_id and locale columns.
      df.select("user_id", "locale").show()
      /*
      Sample output:
      +----------+------+
      |   user_id|locale|
      +----------+------+
      |3197468391| id_ID|
      |3537982273| id_ID|
      | 823183725| en_US|
      | 184647001| id_ID|
      |1013376584| id_ID|
      |2686249984| en_US|
      +----------+------+
      */
    } finally {
      // Release the session's resources regardless of success or failure.
      spark.stop()
    }
  }
}
JSON格式
object SelectJSON {
def main(args: Array[String]): Unit = {
//获取SparkSession实例
val conf: SparkConf = new SparkConf().setAppName("csv").setMaster("local")
val session = SparkSession.builder().config(conf).getOrCreate()
//加载Json格式文件
val df = session.read.format("json").option("header","true").load("in/users.json")
//显示文件数据结构
df.printSchema()
/*
root
|-- Age: long (nullable = true)
|-- name: string (nullable = true)
*/
//显示json文件数据
df.select("Age","name").show()
/*
+----+-------+
| Age| name|
+----+-------+
|null|Michael|
| 30| Andy|
| 19| Justin|
+----+-------+
*/