Spark 读取数据时常会遇到一个需求:需要知道读入文件所在的路径,例如读入数据时需要取出分区路径(通常是日期)。
其实 Hive 中就有内置虚拟列可以直接取文件路径,即 INPUT__FILE__NAME,只不过平时不太常用。Spark SQL 里同样有函数可以直接读取文件路径信息,需要 Spark 2.0 及以上版本。
//需要引入 import org.apache.spark.sql.functions
dataset.withColumn("path",functions.input_file_name())
import java.util.{Calendar, Date}
import com.aotain.iptv.util.DateFormat
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SparkSession, functions}
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}
object ReadFilePath {
  /**
   * Demo: read one week's worth of date-partitioned CSV files and attach
   * each row's source file path as an extra column via `input_file_name()`.
   */
  def main(args: Array[String]): Unit = {
    // Spark application configuration.
    val sparkConf = new SparkConf()
      // Application name shown in the Spark UI.
      .setAppName("ModuleSpark Application")
      .setMaster("local[2]")
    val sc = SparkContext.getOrCreate(sparkConf)
    val spark = SparkSession.builder.config(sparkConf).getOrCreate()

    // Schema of the input files: six pipe-delimited string columns.
    val tvContentSchema = StructType(Array(
      StructField("column1", DataTypes.StringType),
      StructField("column2", DataTypes.StringType),
      StructField("column3", DataTypes.StringType),
      StructField("column4", DataTypes.StringType),
      StructField("column5", DataTypes.StringType),
      StructField("column6", DataTypes.StringType)
    ))
    val tvContentFilepath = "localtest/input/tvcontent/"

    // Walk backwards one week from the anchor date, collecting one
    // partition path per day (anchor date included).
    val date = "20190612"
    var dateList: List[String] = Nil
    val timeDate: Date = new Date()
    timeDate.setTime(DateFormat.dateToTimestampMs(date, "yyyyMMdd"))
    val cal: Calendar = Calendar.getInstance()
    cal.setTime(timeDate)
    for (a <- 0 to 6) {
      // Emit the current day's path first, THEN step back one day.
      // BUGFIX: the original called cal.add(Calendar.DATE, 0), which never
      // advanced the calendar and produced the same date seven times.
      val dateStr = DateFormat.timemsToTimeStamp(cal.getTime.getTime, "yyyyMMdd")
      dateList = (tvContentFilepath + dateStr) :: dateList
      cal.add(Calendar.DATE, -1)
    }
    System.out.println("inputFilePath: " + dateList.mkString(","))
    // inputFilePath: localtest/input/tvcontent/20190606,localtest/input/tvcontent/20190607,...,localtest/input/tvcontent/20190612

    // csv() accepts varargs paths, so the list is expanded with `: _*`.
    // Supplying the schema up front avoids a schema-inference pass.
    val tvContentDS = spark.read.option("delimiter", "|").schema(tvContentSchema).csv(dateList: _*)
    // Append a column holding the source file path of each row.
    val tvContentNewDs = tvContentDS.withColumn("path", functions.input_file_name())
    tvContentNewDs.show(10)
  }
}
最后显示的效果如下:
+-------+-------+-------+-------+-------+-------+--------------------+
|column1|column2|column3|column4|column5|column6| path|
+-------+-------+-------+-------+-------+-------+--------------------+
| test1| test1| test1| test1| test1| test1|file:///C:/Users/...|
| test2| test2| test2| test2| test2| test2|file:///C:/Users/...|
| test3| test3| test3| test3| test3| test3|file:///C:/Users/...|
| test4| test4| test4| test4| test4| test4|file:///C:/Users/...|
| test5| test5| test5| test5| test5| test5|file:///C:/Users/...|
| test6| test6| test6| test6| test6| test6|file:///C:/Users/...|
| test7| test7| test7| test7| test7| test7|file:///C:/Users/...|
| test8| test8| test8| test8| test8| test8|file:///C:/Users/...|
| test9| test9| test9| test9| test9| test9|file:///C:/Users/...|
| test10| test10| test10| test10| test10| test10|file:///C:/Users/...|
+-------+-------+-------+-------+-------+-------+--------------------+