Prepare the people.json file (one JSON object per line, which is the format Spark's JSON reader expects):
{"name":"Michael", "age":29}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}
CSV read/write example:
import org.apache.spark.sql.*;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import java.util.*;
public class test5 {
    public static void main(String[] args) {
        SparkSession spark = SparkSession
                .builder()
                .config("spark.driver.host", "localhost")
                .appName("CSVFileTest")
                .master("local")
                .getOrCreate();
        spark.sparkContext().setLogLevel("ERROR");

        Dataset<Row> jsonDF = spark.read().json(Utils.BASE_PATH + "/people.json");
        jsonDF.show(false);
        // +---+-------+
        // |age|name   |
        // +---+-------+
        // |29 |Michael|
        // |30 |Andy   |
        // |19 |Justin |
        // +---+-------+
        // Convert the JSON data into CSV files
        jsonDF.write().mode(SaveMode.Overwrite).csv(Utils.BASE_PATH + "/csv");

        Dataset<Row> csvDF = spark.read().csv(Utils.BASE_PATH + "/csv");
        csvDF.show(false);
        // +---+-------+
        // |_c0|_c1    |
        // +---+-------+
        // |29 |Michael|
        // |30 |Andy   |
        // |19 |Justin |
        // +---+-------+
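        // Without a header row or an explicit schema, Spark assigns the default
        // column names _c0, _c1, ... As a minimal sketch, toDF can rename them:
        csvDF.toDF("age", "name").show(false);
        // +---+-------+
        // |age|name   |
        // +---+-------+
        // |29 |Michael|
        // |30 |Andy   |
        // |19 |Justin |
        // +---+-------+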
        // Build a Dataset<String> of CSV lines, from which a DataFrame can be created
        List<String> csvStr = Arrays.asList("23,jeffy,26/08/2015 18:00", "34,katy,27/10/2014 18:30");
        Dataset<String> csvDS = spark.createDataset(csvStr, Encoders.STRING());
        csvDS.show(false);
        // +-------------------------+
        // |value                    |
        // +-------------------------+
        // |23,jeffy,26/08/2015 18:00|
        // |34,katy,27/10/2014 18:30 |
        // +-------------------------+
        // Read CSV directly from the Dataset<String>
        Dataset<Row> csvDFFromDS = spark.read().csv(csvDS);
        csvDFFromDS.show(false);
        // +---+-----+----------------+
        // |_c0|_c1  |_c2             |
        // +---+-----+----------------+
        // |23 |jeffy|26/08/2015 18:00|
        // |34 |katy |27/10/2014 18:30|
        // +---+-----+----------------+
        // Build an explicit schema for the three CSV columns
        List<StructField> fields = new ArrayList<>();
        StructField age = DataTypes.createStructField("age", DataTypes.IntegerType, true);
        StructField name = DataTypes.createStructField("name", DataTypes.StringType, true);
        StructField date = DataTypes.createStructField("date", DataTypes.DateType, true);
        fields.add(age);
        fields.add(name);
        fields.add(date);
        StructType customSchema = DataTypes.createStructType(fields);
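        // Alternative sketch (Spark 2.3+): the same schema can be passed to the
        // reader as a DDL string instead of building StructFields by hand:
        //   spark.read().schema("age INT, name STRING, date DATE").csv(...)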
        // Read options
        Map<String, String> readOpts = new HashMap<>();
        // The data has no header row
        readOpts.put("header", "false");
        // inferSchema makes Spark infer the column types with an extra pass over
        // the data; it is slow, so the explicit schema defined above is preferred
        readOpts.put("inferSchema", "false");
        // Skip lines that start with this character
        readOpts.put("comment", "~");
        // dateFormat must match the format used in the raw data, otherwise the
        // dates cannot be parsed
        readOpts.put("dateFormat", "dd/MM/yyyy HH:mm");
        // Read CSV from the Dataset<String> with both the schema and the options.
        // ******** Note: the schema and the options are best supplied together,
        // otherwise the parsed values may come back null ********
        Dataset<Row> data = spark.read().schema(customSchema).options(readOpts).csv(csvDS);
        data.show(false);
        // +---+-----+----------+
        // |age|name |date      |
        // +---+-----+----------+
        // |23 |jeffy|2015-08-26|
        // |34 |katy |2014-10-27|
        // +---+-----+----------+
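        // Sketch of the caveat above: with the schema but without the matching
        // dateFormat option, the date column cannot be parsed; in the default
        // PERMISSIVE mode the unparsable values may simply come back null.
        spark.read().schema(customSchema).csv(csvDS).show(false);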
        // Write options
        Map<String, String> writeOpts = new HashMap<>();
        writeOpts.put("comment", "~");
        writeOpts.put("compression", "gzip");
        // Dates are stored in this format when saving
        writeOpts.put("dateFormat", "yyyy/MM/dd");
        data.write().mode(SaveMode.Overwrite).options(writeOpts).csv(Utils.BASE_PATH + "/csv_options");

        spark.read().csv(Utils.BASE_PATH + "/csv_options").show(false);
        // +---+-----+----------+
        // |_c0|_c1  |_c2       |
        // +---+-----+----------+
        // |23 |jeffy|2015/08/26|
        // |34 |katy |2014/10/27|
        // +---+-----+----------+
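        // Sketch: Spark reads the gzip-compressed part files transparently. To
        // round-trip the dates as DateType rather than strings, read back with
        // the same schema and the dateFormat used at write time:
        spark.read()
                .schema(customSchema)
                .option("dateFormat", "yyyy/MM/dd")
                .csv(Utils.BASE_PATH + "/csv_options")
                .show(false);
        // +---+-----+----------+
        // |age|name |date      |
        // +---+-----+----------+
        // |23 |jeffy|2015-08-26|
        // |34 |katy |2014-10-27|
        // +---+-----+----------+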
        spark.stop();
    }
}