原数据
{"date":"20200512","from":"银川","requestTime":"14:12:38","sceneCode":"010035","seats":[{"flag":0,"oldPrice":0,"price":"70","seatFlag":false,"seatName":"二等座","seats":21},{"flag":0,"oldPrice":0,"price":"112","seatFlag":false,"seatName":"一等座","seats":21},{"flag":0,"oldPrice":0,"price":"70","seatFlag":false,"seatName":"无座","seats":0}],"to":"中卫南","train":"C9885"}
{"date":"20200512","from":"绵阳","requestTime":"14:12:38","sceneCode":"010035","seats":[{"flag":0,"oldPrice":0,"price":"45","seatFlag":false,"seatName":"二等座","seats":0},{"flag":0,"oldPrice":0,"price":"72","seatFlag":false,"seatName":"一等座","seats":1},{"flag":0,"oldPrice":0,"price":"136","seatFlag":false,"seatName":"商务座","seats":3}],"to":"成都东","train":"G2213","transfersItemId":["MR/20jKzva2LSsBW4vB4mA==","7ZS6+3RlosndMnaqZViuPw==","JcNWxx0wzzSLE8B61MSu3w=="]}
{"date":"20200512","from":"厦门北","requestTime":"14:12:38","sceneCode":"010035","seats":[{"flag":0,"oldPrice":0,"price":"90.5","seatFlag":false,"seatName":"二等座","seats":0},{"flag":0,"oldPrice":0,"price":"144","seatFlag":false,"seatName":"一等座","seats":1},{"flag":0,"oldPrice":0,"price":"90.5","seatFlag":false,"seatName":"无座","seats":0}],"to":"长汀南","train":"D3145","transfersItemId":["52BFWTIIG8aDHelPPFqbcQ==","x1DUvd5Jho8jzZZspzQF4w==","Gq/i5mH9pqvWYk3PqXWnug=="]}
{"date":"20200513","from":"百色","requestTime":"14:12:38","sceneCode":"010035","seats":[{"flag":0,"oldPrice":0,"price":"105","seatFlag":false,"seatName":"二等座","seats":21},{"flag":0,"oldPrice":0,"price":"270","seatFlag":false,"seatName":"动卧","seats":2},{"flag":0,"oldPrice":0,"price":"105","seatFlag":false,"seatName":"无座","seats":0}],"to":"普者黑","train":"D3811"}
// Load the raw JSON-lines file; Spark infers the schema (including the
// nested seats array<struct> and the optional transfersItemId array).
Dataset<Row> tableData = spark.read().json("json/noticket_recommend.json");

// Inspect the inferred schema, then dump up to 100 rows untruncated.
tableData.printSchema();
tableData.show(100, false);
结果
root
|-- date: string (nullable = true)
|-- from: string (nullable = true)
|-- requestTime: string (nullable = true)
|-- sceneCode: string (nullable = true)
|-- seats: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- flag: long (nullable = true)
| | |-- oldPrice: long (nullable = true)
| | |-- price: string (nullable = true)
| | |-- seatFlag: boolean (nullable = true)
| | |-- seatName: string (nullable = true)
| | |-- seats: long (nullable = true)
|-- to: string (nullable = true)
|-- train: string (nullable = true)
|-- transfersItemId: array (nullable = true)
| |-- element: string (containsNull = true)
+--------+------+-----------+---------+---------------------------------------------------------------------------------------------+------+-----+------------------------------------------------------------------------------+
|date |from |requestTime|sceneCode|seats |to |train|transfersItemId |
+--------+------+-----------+---------+---------------------------------------------------------------------------------------------+------+-----+------------------------------------------------------------------------------+
|20200512|银川 |14:12:38 |010035 |[[0, 0, 70, false, 二等座, 21], [0, 0, 112, false, 一等座, 21], [0, 0, 70, false, 无座, 0]] |中卫南|C9885|null |
|20200512|绵阳 |14:12:38 |010035 |[[0, 0, 45, false, 二等座, 0], [0, 0, 72, false, 一等座, 1], [0, 0, 136, false, 商务座, 3]] |成都东|G2213|[MR/20jKzva2LSsBW4vB4mA==, 7ZS6+3RlosndMnaqZViuPw==, JcNWxx0wzzSLE8B61MSu3w==]|
|20200512|厦门北|14:12:38 |010035 |[[0, 0, 90.5, false, 二等座, 0], [0, 0, 144, false, 一等座, 1], [0, 0, 90.5, false, 无座, 0]]|长汀南|D3145|[52BFWTIIG8aDHelPPFqbcQ==, x1DUvd5Jho8jzZZspzQF4w==, Gq/i5mH9pqvWYk3PqXWnug==]|
|20200513|百色 |14:12:38 |010035 |[[0, 0, 105, false, 二等座, 21], [0, 0, 270, false, 动卧, 2], [0, 0, 105, false, 无座, 0]] |普者黑|D3811|null |
+--------+------+-----------+---------+---------------------------------------------------------------------------------------------+------+-----+------------------------------------------------------------------------------+
现在我只想取array<struct>
里面的price,seatFlag,seatName,seats字段插入hive表
查了下文档,可以利用arrays_zip函数(注意:需要Spark 2.4及以上版本),直接上代码
// Hive-friendly target type: one cast both renames the zipped fields to
// snake_case and fixes their types (seatFlag boolean -> string).
String seatInfoType =
        "array<struct<price:string,seat_flag:string,seat_name:string,seat_num:long>>";

// arrays_zip (Spark >= 2.4) pairs up the element-wise projections of the
// seats array, yielding array<struct<...>> with only the wanted fields;
// the original seats column is dropped afterwards.
Dataset<Row> mergeData = tableData
        .withColumn("seat_info",
                arrays_zip(
                        col("seats.price"),
                        col("seats.seatFlag"),
                        col("seats.seatName"),
                        col("seats.seats"))
                .cast(seatInfoType))
        .drop("seats");

mergeData.printSchema();
mergeData.show(100, false);
root
|-- date: string (nullable = true)
|-- from: string (nullable = true)
|-- requestTime: string (nullable = true)
|-- sceneCode: string (nullable = true)
|-- to: string (nullable = true)
|-- train: string (nullable = true)
|-- transfersItemId: array (nullable = true)
| |-- element: string (containsNull = true)
|-- seat_info: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- price: string (nullable = true)
| | |-- seat_flag: string (nullable = true)
| | |-- seat_name: string (nullable = true)
| | |-- seat_num: long (nullable = true)
+--------+------+-----------+---------+------+-----+------------------------------------------------------------------------------+---------------------------------------------------------------------------+
|date |from |requestTime|sceneCode|to |train|transfersItemId |seat_info |
+--------+------+-----------+---------+------+-----+------------------------------------------------------------------------------+---------------------------------------------------------------------------+
|20200512|银川 |14:12:38 |010035 |中卫南|C9885|null |[[70, false, 二等座, 21], [112, false, 一等座, 21], [70, false, 无座, 0]] |
|20200512|绵阳 |14:12:38 |010035 |成都东|G2213|[MR/20jKzva2LSsBW4vB4mA==, 7ZS6+3RlosndMnaqZViuPw==, JcNWxx0wzzSLE8B61MSu3w==]|[[45, false, 二等座, 0], [72, false, 一等座, 1], [136, false, 商务座, 3]] |
|20200512|厦门北|14:12:38 |010035 |长汀南|D3145|[52BFWTIIG8aDHelPPFqbcQ==, x1DUvd5Jho8jzZZspzQF4w==, Gq/i5mH9pqvWYk3PqXWnug==]|[[90.5, false, 二等座, 0], [144, false, 一等座, 1], [90.5, false, 无座, 0]]|
|20200513|百色 |14:12:38 |010035 |普者黑|D3811|null |[[105, false, 二等座, 21], [270, false, 动卧, 2], [105, false, 无座, 0]] |
+--------+------+-----------+---------+------+-----+------------------------------------------------------------------------------+---------------------------------------------------------------------------+
下一步:插入hive表。
参考文档
https://stackoverflow.com/questions/36476358/how-to-select-a-subset-of-fields-from-an-array-column-in-spark