Spark 读取 CSV 文件时可以自动推断各字段的数据类型(inferSchema)
原文件内容如下(节选):
parent_order_no,member_id,union_id,create_time,create_date
TW168E93658FBHBSATPTX9866,33459866,ohmdTt1gcNVNORpm_onak1nOTduE,2019-02-14 07:34:39,2019-03-07
TW169383A50F2RPXEQ8TT4073,34224073,ohmdTt9ajPpDDyGddYy0HMZtDH-s,2019-03-01 15:48:59,2019-03-07
TW168E443C4E4VU1RCLS46953,34276953,ohmdTt6OsysaxJ9KVN74XASXMZ1A,2019-02-13 08:31:13,2019-03-07
TW168D6B6B869AYWAUTC50242,34680242,ohmdTtw319HkEVoWiga_D5NjoSCw,2019-02-10 17:22:05,2019-03-07
TW1690B671763YFR3VNRJ6007,34756007,ohmdTt5lHZmH2fhc6gcOEkEqJezw,2019-02-20 22:54:59,2019-03-07
TW1693D587B38ZKJOBIAQ6699,34756699,ohmdTtwH4TD1oWhmdTBYdLhIFdzA,2019-03-02 15:40:02,2019-03-07
TW168DAD663EAKWGTBVXW1762,34821762,ohmdTt3iPO3HzUqbLAp9el9bomFY,2019-02-11 12:35:09,2019-03-07
TW169090283839NTOOS5L6659,35026659,ohmdTtyh0lC5GvDLNMA08VlvHDCI,2019-02-20 11:45:53,2019-03-07
具体代码如下:
// Location of the sample CSV file on the local disk.
String csvPath = "D:\\data\\csv\\stats_order_has_gone.csv";
// Read the file as GBK text, treating the first line as the header and
// letting Spark infer each column's type from the data.
Dataset<Row> tableData = spark.read()
        .option("header", "true")
        .option("encoding", "gbk")
        .option("inferschema", "true")
        .csv(csvPath);
// Print the rows without truncating cell values, then the inferred column types.
tableData.show(false);
tableData.printSchema();
发现 Spark 自动做了类型推断:member_id 被识别为 integer,create_time 和 create_date 被识别为 timestamp,结果如下:
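+-------------------------+---------+----------------------------+---------------------+---------------------+
|parent_order_no |member_id|union_id |create_time |create_date |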
+-------------------------+---------+----------------------------+---------------------+---------------------+
|TW168E93658FBHBSATPTX9866|33459866 |ohmdTt1gcNVNORpm_onak1nOTduE|2019-02-14 07:34:39.0|2019-03-07 00:00:00.0|
|TW169383A50F2RPXEQ8TT4073|34224073 |ohmdTt9ajPpDDyGddYy0HMZtDH-s|2019-03-01 15:48:59.0|2019-03-07 00:00:00.0|
|TW168E443C4E4VU1RCLS46953|34276953 |ohmdTt6OsysaxJ9KVN74XASXMZ1A|2019-02-13 08:31:13.0|2019-03-07 00:00:00.0|
|TW168D6B6B869AYWAUTC50242|34680242 |ohmdTtw319HkEVoWiga_D5NjoSCw|2019-02-10 17:22:05.0|2019-03-07 00:00:00.0|
|TW1690B671763YFR3VNRJ6007|34756007 |ohmdTt5lHZmH2fhc6gcOEkEqJezw|2019-02-20 22:54:59.0|2019-03-07 00:00:00.0|
|TW1693D587B38ZKJOBIAQ6699|34756699 |ohmdTtwH4TD1oWhmdTBYdLhIFdzA|2019-03-02 15:40:02.0|2019-03-07 00:00:00.0|
|TW168DAD663EAKWGTBVXW1762|34821762 |ohmdTt3iPO3HzUqbLAp9el9bomFY|2019-02-11 12:35:09.0|2019-03-07 00:00:00.0|
|TW169090283839NTOOS5L6659|35026659 |ohmdTtyh0lC5GvDLNMA08VlvHDCI|2019-02-20 11:45:53.0|2019-03-07 00:00:00.0|
|TW1690F806D32CKJV5VMI7312|35027312 |ohmdTt02NZkLW5HWHTBDd9Y-70nA|2019-02-21 18:01:08.0|2019-03-07 00:00:00.0|
|TW168BB857F872K3IBGBZ4371|35124371 |ohmdTt6cxAb9I1lurmOwwRIdE5Vo|2019-02-05 10:38:34.0|2019-03-07 00:00:00.0|
|TW169369EAAE1WJAXWWYW4773|35134773 |ohmdTt51puuJsu-KxxN8tVB2UZM4|2019-03-01 08:19:21.0|2019-03-07 00:00:00.0|
|TW169481F88121OZFRZCE4358|35214358 |ohmdTt2tfioQ-up5PgVtqkTh7gLI|2019-03-04 17:53:39.0|2019-03-07 00:00:00.0|
|TW168E5B3C4CBFLEWSMAP5926|35345926 |ohmdTt4ASc4c21Y2wr7yy-PeyOfI|2019-02-13 15:13:10.0|2019-03-07 00:00:00.0|
|TW169191389684RNWPYBZ4698|35384698 |ohmdTt3jeBTqb7mb0wLCqDgYZr78|2019-02-23 14:38:24.0|2019-03-07 00:00:00.0|
|TW1690B463A43OMIA9IVD3954|35423954 |ohmdTt5zKdqTIuS3lI9LwPXcYoN0|2019-02-20 22:19:05.0|2019-03-07 00:00:00.0|
|TW1693C010D8DCPC8WDFQ4192|35434192 |ohmdTt9vjIi9bNoSWXtEGk3826kQ|2019-03-02 09:24:55.0|2019-03-07 00:00:00.0|
|TW169228B7674RUKQV5FS4145|35484145 |ohmdTt8-Hq48Tnz8uSXjxFq02Ph8|2019-02-25 10:45:59.0|2019-03-07 00:00:00.0|
|TW1694D4FB3C4Y09PWRRX4747|35494747 |ohmdTt3O6z_v2wmkD4k-7RpTBraQ|2019-03-05 18:04:22.0|2019-03-07 00:00:00.0|
|TW1690F2B45A1PQMODAMP3739|35593739 |ohmdTtztAytxlFe6k9AZ4YuTatC4|2019-02-21 16:28:07.0|2019-03-07 00:00:00.0|
|TW168E5DFB332Z6JODMWQ5532|35635532 |ohmdTtwp6WatNoIz3fItSYhVw0cA|2019-02-13 16:01:09.0|2019-03-07 00:00:00.0|
+-------------------------+---------+----------------------------+---------------------+---------------------+
only showing top 20 rows
root
|-- parent_order_no: string (nullable = true)
|-- member_id: integer (nullable = true)
|-- union_id: string (nullable = true)
|-- create_time: timestamp (nullable = true)
|-- create_date: timestamp (nullable = true)
如果希望保留文件中的原始字符串内容,就需要显式定义 schema,并把所有字段的类型都设为 string:
/**
 * Builds the explicit schema for the order CSV: one nullable string column
 * per name in the comma-separated column list, in the listed order.
 *
 * @return a struct type whose fields are all nullable {@code StringType}
 */
private StructType tableSchema(){
    final String[] columnNames =
            "parent_order_no,member_id,union_id,create_time,create_date".split(",");
    final StructField[] columns = new StructField[columnNames.length];
    for (int i = 0; i < columnNames.length; i++) {
        // Every column is declared as a nullable string so no value gets converted.
        columns[i] = DataTypes.createStructField(columnNames[i], DataTypes.StringType, true);
    }
    return DataTypes.createStructType(columns);
}
修改代码
/**
 * Loads the order CSV using the explicit schema from {@link #tableSchema()}
 * instead of schema inference, prints the rows and the schema, and returns
 * the loaded data.
 *
 * @param spark session used to create the CSV reader
 * @return the dataset read from the CSV file
 */
private Dataset<Row> offlineTable(SparkSession spark){
    String csvPath = "D:\\data\\csv\\stats_order_has_gone.csv";
    // Schema inference is deliberately not enabled here; the column types
    // come from tableSchema() instead.
    Dataset<Row> rows = spark.read()
            .schema(tableSchema())
            .option("encoding", "gbk")
            .option("header", "true")
            .csv(csvPath);
    // Print the rows without truncating cell values, then the applied schema.
    rows.show(false);
    rows.printSchema();
    return rows;
}
结果如下:所有字段都是 string 类型,内容与源文件中的原始文本保持一致
+-------------------------+---------+----------------------------+-------------------+-----------+
|parent_order_no |member_id|union_id |create_time |create_date|
+-------------------------+---------+----------------------------+-------------------+-----------+
|TW168E93658FBHBSATPTX9866|33459866 |ohmdTt1gcNVNORpm_onak1nOTduE|2019-02-14 07:34:39|2019-03-07 |
|TW169383A50F2RPXEQ8TT4073|34224073 |ohmdTt9ajPpDDyGddYy0HMZtDH-s|2019-03-01 15:48:59|2019-03-07 |
|TW168E443C4E4VU1RCLS46953|34276953 |ohmdTt6OsysaxJ9KVN74XASXMZ1A|2019-02-13 08:31:13|2019-03-07 |
|TW168D6B6B869AYWAUTC50242|34680242 |ohmdTtw319HkEVoWiga_D5NjoSCw|2019-02-10 17:22:05|2019-03-07 |
|TW1690B671763YFR3VNRJ6007|34756007 |ohmdTt5lHZmH2fhc6gcOEkEqJezw|2019-02-20 22:54:59|2019-03-07 |
|TW1693D587B38ZKJOBIAQ6699|34756699 |ohmdTtwH4TD1oWhmdTBYdLhIFdzA|2019-03-02 15:40:02|2019-03-07 |
|TW168DAD663EAKWGTBVXW1762|34821762 |ohmdTt3iPO3HzUqbLAp9el9bomFY|2019-02-11 12:35:09|2019-03-07 |
|TW169090283839NTOOS5L6659|35026659 |ohmdTtyh0lC5GvDLNMA08VlvHDCI|2019-02-20 11:45:53|2019-03-07 |
|TW1690F806D32CKJV5VMI7312|35027312 |ohmdTt02NZkLW5HWHTBDd9Y-70nA|2019-02-21 18:01:08|2019-03-07 |
|TW168BB857F872K3IBGBZ4371|35124371 |ohmdTt6cxAb9I1lurmOwwRIdE5Vo|2019-02-05 10:38:34|2019-03-07 |
|TW169369EAAE1WJAXWWYW4773|35134773 |ohmdTt51puuJsu-KxxN8tVB2UZM4|2019-03-01 08:19:21|2019-03-07 |
|TW169481F88121OZFRZCE4358|35214358 |ohmdTt2tfioQ-up5PgVtqkTh7gLI|2019-03-04 17:53:39|2019-03-07 |
|TW168E5B3C4CBFLEWSMAP5926|35345926 |ohmdTt4ASc4c21Y2wr7yy-PeyOfI|2019-02-13 15:13:10|2019-03-07 |
|TW169191389684RNWPYBZ4698|35384698 |ohmdTt3jeBTqb7mb0wLCqDgYZr78|2019-02-23 14:38:24|2019-03-07 |
|TW1690B463A43OMIA9IVD3954|35423954 |ohmdTt5zKdqTIuS3lI9LwPXcYoN0|2019-02-20 22:19:05|2019-03-07 |
|TW1693C010D8DCPC8WDFQ4192|35434192 |ohmdTt9vjIi9bNoSWXtEGk3826kQ|2019-03-02 09:24:55|2019-03-07 |
|TW169228B7674RUKQV5FS4145|35484145 |ohmdTt8-Hq48Tnz8uSXjxFq02Ph8|2019-02-25 10:45:59|2019-03-07 |
|TW1694D4FB3C4Y09PWRRX4747|35494747 |ohmdTt3O6z_v2wmkD4k-7RpTBraQ|2019-03-05 18:04:22|2019-03-07 |
|TW1690F2B45A1PQMODAMP3739|35593739 |ohmdTtztAytxlFe6k9AZ4YuTatC4|2019-02-21 16:28:07|2019-03-07 |
|TW168E5DFB332Z6JODMWQ5532|35635532 |ohmdTtwp6WatNoIz3fItSYhVw0cA|2019-02-13 16:01:09|2019-03-07 |
+-------------------------+---------+----------------------------+-------------------+-----------+
only showing top 20 rows
root
|-- parent_order_no: string (nullable = true)
|-- member_id: string (nullable = true)
|-- union_id: string (nullable = true)
|-- create_time: string (nullable = true)
|-- create_date: string (nullable = true)