// Related resource files (Baidu Pan):
// Link: https://pan.baidu.com/s/1QGQIrVwg56g9eF16ERSLwQ
// Extraction code: 7v8n
// Example: reading and writing JSON files with Spark SQL.
import org.apache.spark.sql.*;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import java.util.*;
/**
 * Demonstrates JSON read/write with Spark SQL:
 * converting Parquet data to JSON, reading JSON back, creating a DataFrame
 * from an in-memory JSON Dataset&lt;String&gt;, and configuring reader/writer
 * options (primitivesAsString, allowComments, compression, dateFormat)
 * together with an explicit schema.
 */
public class test8 {
    public static void main(String[] args) {
        // Local SparkSession for the example; log level reduced to keep output readable.
        SparkSession spark = SparkSession
                .builder()
                .config("spark.driver.host", "localhost")
                .appName("JsonFileTest")
                .master("local")
                .getOrCreate();
        spark.sparkContext().setLogLevel("ERROR");

        // Convert Parquet session data to JSON files.
        Dataset<Row> sessionData = spark.read().parquet(Utils.BASE_PATH + "/trackerSession");
        sessionData.show(false);
        // Expected columns: session_id, session_server_time, cookie, cookie_label,
        // ip, landing_url, pageview_count, click_count, domain, domain_label.
        sessionData.write().mode(SaveMode.Overwrite).json(Utils.BASE_PATH + "/json");

        // Read the JSON files back; note that column order is alphabetical
        // because the schema is inferred from the JSON keys.
        Dataset<Row> jsonFromFiles = spark.read().json(Utils.BASE_PATH + "/json");
        jsonFromFiles.show(false);

        // A DataFrame can also be created from a JSON Dataset<String>.
        List<String> rawJsonRecords =
                Arrays.asList("{\"name\":\"Yin\",\"address\":{\"is_old\":true,\"area\":23000.34}}");
        Dataset<String> jsonStrings = spark.createDataset(rawJsonRecords, Encoders.STRING());
        jsonStrings.show(false);
        // Shown as a single "value" column containing the raw JSON text.

        // Parse the JSON strings into a structured DataFrame.
        Dataset<Row> parsedFromDataset = spark.read().json(jsonStrings);
        parsedFromDataset.show(false);
        // The nested "address" object becomes a struct column.

        // Reader options:
        Map<String, String> readerOptions = new HashMap<>();
        // Infer all primitive values as strings.
        readerOptions.put("primitivesAsString", "true");
        // Tolerate Java/C++-style comments inside JSON records.
        readerOptions.put("allowComments", "true");
        parsedFromDataset = spark.read().options(readerOptions).json(jsonStrings);
        parsedFromDataset.show(false);

        // Writer options:
        Map<String, String> writerOptions = new HashMap<>();
        // Compression codec used when saving to files.
        writerOptions.put("compression", "gzip");
        writerOptions.put("dateFormat", "yyyy/MM/dd");

        // Explicit schema so "date" is parsed as a DateType rather than a string.
        StructField nameField = DataTypes.createStructField("name", DataTypes.StringType, true);
        StructField dateField = DataTypes.createStructField("date", DataTypes.DateType, true);
        StructType dateSchema = DataTypes.createStructType(Arrays.asList(nameField, dateField));

        List<String> dateRecords = Arrays.asList("{'name':'Yin','date':'26/08/2015 18:00'}");
        Dataset<String> dateJsonStrings = spark.createDataset(dateRecords, Encoders.STRING());
        dateJsonStrings.show(false);

        // Read with the custom schema plus a dateFormat matching the input,
        // then write with the gzip/dateFormat writer options.
        Dataset<Row> dateFrame =
                spark.read().schema(dateSchema).option("dateFormat", "dd/MM/yyyy HH:mm").json(dateJsonStrings);
        dateFrame.write().mode(SaveMode.Overwrite)
                .options(writerOptions).json(Utils.BASE_PATH + "/json_date");
        spark.read().json(Utils.BASE_PATH + "/json_date").show(false);
        // Expected: date rendered as 2015/08/26 per the writer's dateFormat.

        spark.stop();
    }
}