我可以给你一个解决方案,不过我不清楚你的具体用例:如果你只是想提取 id 作为区分 json 文件的键,我认为只需在数据帧上过滤 id 列即可。无论如何,以下方法可以满足你的要求:
测试json文件:test.json
{"id": "1","data": "data1","image": "image1"}
{"id": "2","data": "data2","image": "image2"}
Scala 代码(使用 withColumn、udf 和 json4s):
import org.json4s.{DefaultFormats, MappingException}
import org.json4s.jackson.JsonMethods._
import org.apache.spark.sql.functions._
/** Extracts the top-level "id" field of a JSON document as a String.
  *
  * @param jsonstring JSON text to parse
  * @return the value found under "id", extracted as a String
  */
def getJsonKey(jsonstring: String): String = {
  // json4s `extract` resolves its Formats implicitly; use the library defaults.
  implicit val formats: DefaultFormats.type = DefaultFormats
  (parse(jsonstring) \ "id").extract[String]
}
// Spark UDF wrapper around getJsonKey, for use in DataFrame column expressions.
val getJsonKeyUDF = udf(getJsonKey _)
val df = spark.read.format("text