在企业大数据项目中,我们经常会遇到这样的需求:
给一个单行json,里面包含多个同级别的小json,需要得到所有的小json。
比如,大json为:
{"Records":[{"name":"a"},{"name":"b"},{"name":"c"},{"name":"d"}]}
需要得到json数组:
{"name":"a"}
{"name":"b"}
{"name":"c"}
{"name":"d"}
解决方法:
pom文件中添加阿里巴巴的FastJson依赖:
<!-- 阿里fastjson包JSON转换依赖(注意:1.2.62 及更早版本存在已知的反序列化安全漏洞,生产环境建议升级到官方修复版本)-->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.62</version>
</dependency>
当然,项目中还需要引入 Spark 的相关依赖。
创建测试json文件(单行json):
{"Records":[{"eventVersion":"1.05","userIdentity":{"type":"IAMUser","principalId":"AIDAIKWK2VHCFYOMMLSTU","arn":"arn:aws:iam::439370927188:user/bigdata","accountId":"439370927188","accessKeyId":"AKIAWMTEMJRKD7BKSKDX","userName":"bigdata"},"eventTime":"2020-01-02T23:47:49Z","eventSource":"s3.amazonaws.com","eventName":"PutObject","awsRegion":"eu-west-1","sourceIPAddress":"10.20.9.188","userAgent":"[JetS3t/0.9.4 (Linux/3.10.0-957.1.3.el7.x86_64; amd64; en; JVM 1.8.0_121)]","requestParameters":{"bucketName":"temp-test","Host":"temp-test.s3.amazonaws.com:443","key":"online/test_10280002/20200103/07/test_10280002_202001030730.1578007803517"},"responseElements":null,"additionalEventData":{"SignatureVersion":"SigV2","CipherSuite":"ECDHE-RSA-AES128-SHA","bytesTransferredIn":3.0025087E7,"AuthenticationMethod":"AuthHeader","x-amz-id-2":"+bJ+f6caCOogg78znb0uyZytg7O6HTHx00NDTmv6Vk9ctv5TJu8wJ/C1UohjRjao0M4nyZpq68o=","bytesTransferredOut":0.0},"requestID":"556DF42E172942F7","eventID":"8d320d48-ede0-41f4-96bd-ad7671c95f0f","readOnly":false,"resources":[{"type":"AWS::S3::Object","ARN":"arn:aws:s3:::temp-test/online/test_10280002/20200103/07/test_10280002_202001030730.1578007803517"},{"accountId":"439370927188","type":"AWS::S3::Bucket","ARN":"arn:aws:s3:::temp-test"}],"eventType":"AwsApiCall","recipientAccountId":"439370927188","vpcEndpointId":"vpce-0ccdb9c290472258e"},{"eventVersion":"1.05","userIdentity":{"type":"IAMUser","principalId":"AIDAIKWK2VHCFYOMMLSTU","arn":"arn:aws:iam::439370927188:user/bigdata","accountId":"439370927188","accessKeyId":"AKIAWMTEMJRKD7BKSKDX","userName":"bigdata"},"eventTime":"2020-01-02T23:47:49Z","eventSource":"s3.amazonaws.com","eventName":"PutObject","awsRegion":"eu-west-1","sourceIPAddress":"10.20.9.188","userAgent":"[JetS3t/0.9.4 (Linux/3.10.0-957.1.3.el7.x86_64; amd64; en; JVM 
1.8.0_121)]","requestParameters":{"bucketName":"temp-test","Host":"temp-test.s3.amazonaws.com:443","key":"online/test_10280002/20200103/07/test_10280002_202001030730.1578007803517"},"responseElements":null,"additionalEventData":{"SignatureVersion":"SigV2","CipherSuite":"ECDHE-RSA-AES128-SHA","bytesTransferredIn":3.0025087E7,"AuthenticationMethod":"AuthHeader","x-amz-id-2":"+bJ+f6caCOogg78znb0uyZytg7O6HTHx00NDTmv6Vk9ctv5TJu8wJ/C1UohjRjao0M4nyZpq68o=","bytesTransferredOut":0.0},"requestID":"556DF42E172942F7","eventID":"14324234512-we1-4321-we13-523ewer1","readOnly":false,"resources":[{"type":"AWS::S3::Object","ARN":"arn:aws:s3:::temp-test/online/test_10280002/20200103/07/test_10280002_202001030730.1578007803517"},{"accountId":"439370927188","type":"AWS::S3::Bucket","ARN":"arn:aws:s3:::temp-test"}],"eventType":"AwsApiCall","recipientAccountId":"439370927188","vpcEndpointId":"vpce-0ccdb9c290472258e"}]}
scala代码:
package sparkSql
import com.alibaba.fastjson.{JSON, JSONObject}
import org.apache.spark.sql.SparkSession
/**
 * Reads a text file of single-line JSON records and explodes the
 * top-level "Records" array of each line into individual child JSON
 * objects, printing one child object per line.
 *
 * VM option: -Dspark.master=local
 *
 * @author cherry
 * @create 2020-01-31-20:21
 */
object TempTest {
  def main(args: Array[String]): Unit = {
    // Fail fast with a usage hint instead of an opaque
    // ArrayIndexOutOfBoundsException when no input path is given.
    require(args.nonEmpty, "Usage: TempTest <input-path>")
    val spark: SparkSession = SparkSession.builder().appName("GetJsonArray")
      .getOrCreate()
    try {
      spark.sparkContext
        .textFile(args(0))
        // Explode each line's top-level "Records" array into its child
        // JSON objects. getJSONArray returns null when the key is absent,
        // which would NPE on .toArray — wrap in Option so malformed lines
        // contribute nothing instead of crashing the whole job.
        .flatMap { line =>
          Option(JSON.parseObject(line, classOf[JSONObject]).getJSONArray("Records"))
            .map(_.toArray)
            .getOrElse(Array.empty[AnyRef])
        }
        // NOTE: println runs on the executors; the output is visible on
        // this console only because the job runs in local mode.
        .foreach(println)
    } finally {
      // Release Spark resources even when the job throws.
      spark.stop()
    }
  }
}
运行程序前的json
运行程序,查看控制台:
可以看到json已经被展开成多个子json。
为了方便复用,我们还可以将 flatMap 算子中的代码提取成一个独立函数,代码如下:
/**
 * Extracts the JSON array stored under key `head` from a single-line
 * JSON string and returns its elements as an object array.
 *
 * @param json the JSON string to parse
 * @param head the key whose value is the array to extract
 * @return the array's elements, or an empty array when the key is
 *         absent or null (getJSONArray returns null in that case,
 *         which would otherwise throw a NullPointerException on toArray)
 */
def getJsonArray(json: String, head: String): Array[AnyRef] = {
  // Option(...) converts fastjson's null-on-missing-key into None,
  // so callers never see an NPE for absent keys.
  Option(JSON.parseObject(json, classOf[JSONObject]).getJSONArray(head))
    .map(_.toArray)
    .getOrElse(Array.empty[AnyRef])
}