Related resource files
Link: https://pan.baidu.com/s/1QGQIrVwg56g9eF16ERSLwQ
Extraction code: 7v8n
Load (read) and save (write) operations work with parquet-format files by default.
The data format can also be specified explicitly via format() when loading and writing, for example:
spark.read().format("json").load()
deviceInfoDF.write().mode(SaveMode.Overwrite).format("orc").save()
Code example
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
public class test17 {
    public static void main(String[] args) {
        SparkSession spark = SparkSession
                .builder()
                .config("spark.driver.host", "localhost")
                .appName("BasicTest")
                .master("local")
                .getOrCreate();
        spark.sparkContext().setLogLevel("ERROR");
        //The most basic load and save operations; the file data format defaults to parquet
        Dataset<Row> sessionDF = spark.read().load(Utils.BASE_PATH + "/trackerSession");
        sessionDF.show(false);
        // +------------------------------------+-------------------+-------+------------+---------+----------------------------------+--------------+-----------+---------------+------------+
        // |session_id                          |session_server_time|cookie |cookie_label|ip       |landing_url                       |pageview_count|click_count|domain         |domain_label|
        // +------------------------------------+-------------------+-------+------------+---------+----------------------------------+--------------+-----------+---------------+------------+
        // |520815c9-bdd4-40c5-9ffa-df491dcd97e1|2017-09-04 12:00:00|cookie1|固执          |127.0.0.3|https://www.baidu.com             |1             |2          |www.baidu.com  |level1      |
        // |912a4b47-6984-4763-a704-699ee9724585|2017-09-04 12:45:01|cookie1|固执          |127.0.0.3|https://tieba.baidu.com/index.html|1             |2          |tieba.baidu.com|-           |
        // |79534f7c-b4dc-4bc6-b021-c05d5ceb634c|2017-09-04 12:00:01|cookie2|有偏见         |127.0.0.4|https://www.baidu.com             |3             |1          |www.baidu.com  |level1      |
        // +------------------------------------+-------------------+-------+------------+---------+----------------------------------+--------------+-----------+---------------+------------+
        sessionDF.select("ip", "cookie").write().mode(SaveMode.Overwrite).save(Utils.BASE_PATH + "/trackerSession_ip_cookie");
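        //A quick round-trip check (illustrative sketch, not part of the original flow):
        //since parquet is the default format, the generic loader reads the result back directly
        Dataset<Row> ipCookieDF = spark.read().load(Utils.BASE_PATH + "/trackerSession_ip_cookie");
        ipCookieDF.printSchema();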
        //Data files can be loaded from several directories at once
        Dataset<Row> multiSessionDF = spark.read().load(Utils.BASE_PATH + "/trackerSession",
                Utils.BASE_PATH + "/trackerSession_ip_cookie");
        multiSessionDF.show(false);
        // +------------------------------------+-------------------+-------+------------+---------+----------------------------------+--------------+-----------+---------------+------------+
        // |session_id                          |session_server_time|cookie |cookie_label|ip       |landing_url                       |pageview_count|click_count|domain         |domain_label|
        // +------------------------------------+-------------------+-------+------------+---------+----------------------------------+--------------+-----------+---------------+------------+
        // |520815c9-bdd4-40c5-9ffa-df491dcd97e1|2017-09-04 12:00:00|cookie1|固执          |127.0.0.3|https://www.baidu.com             |1             |2          |www.baidu.com  |level1      |
        // |912a4b47-6984-4763-a704-699ee9724585|2017-09-04 12:45:01|cookie1|固执          |127.0.0.3|https://tieba.baidu.com/index.html|1             |2          |tieba.baidu.com|-           |
        // |79534f7c-b4dc-4bc6-b021-c05d5ceb634c|2017-09-04 12:00:01|cookie2|有偏见         |127.0.0.4|https://www.baidu.com             |3             |1          |www.baidu.com  |level1      |
        // |null                                |null               |cookie1|null        |127.0.0.3|null                              |null          |null       |null           |null        |
        // |null                                |null               |cookie1|null        |127.0.0.3|null                              |null          |null       |null           |null        |
        // |null                                |null               |cookie2|null        |127.0.0.4|null                              |null          |null       |null           |null        |
        // +------------------------------------+-------------------+-------+------------+---------+----------------------------------+--------------+-----------+---------------+------------+
        //Specify a schema when reading
        StructType schema =
                DataTypes.createStructType(Arrays.asList(DataTypes.createStructField("ip", DataTypes.StringType, true)));
        Dataset<Row> specSessionDF =
                spark.read().schema(schema).load(Utils.BASE_PATH + "/trackerSession");
        specSessionDF.show(false);
        // +---------+
        // |ip       |
        // +---------+
        // |127.0.0.3|
        // |127.0.0.3|
        // |127.0.0.4|
        // +---------+
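        //Illustrative alternative (assumes Spark 2.3+, not shown in the original):
        //the schema can also be passed as a DDL string, equivalent to the StructType above
        Dataset<Row> ddlSessionDF = spark.read().schema("ip STRING").load(Utils.BASE_PATH + "/trackerSession");
        ddlSessionDF.printSchema();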
        //Specify the data source format explicitly:
        //read a JSON file and save the resulting data as ORC files
        Dataset<Row> deviceInfoDF =
                spark.read().format("json").load(Utils.BASE_PATH + "/IoT_device_info.json");
        deviceInfoDF.show(false);
        // +-------------+---------+----+-------------+---------+-------------+-------------+------+----+----------+
        // |battery_level|c02_level|cca3|cn           |device_id|device_type  |ip           |signal|temp|timestamp |
        // +-------------+---------+----+-------------+---------+-------------+-------------+------+----+----------+
        // |8            |917      |USA |United States|0        |sensor-ipad  |68.161.225.1 |23    |25  |1475600496|
        // |6            |1413     |NOR |Norway       |1        |sensor-igauge|213.161.254.1|18    |30  |1475600498|
        // |5            |1372     |ITA |Italy        |2        |sensor-ipad  |88.36.5.1    |25    |18  |1475600500|
        // |1            |1447     |USA |United States|3        |sensor-inest |66.39.173.154|12    |47  |1475600502|
        // +-------------+---------+----+-------------+---------+-------------+-------------+------+----+----------+
        deviceInfoDF.write().mode(SaveMode.Overwrite).format("orc").save(Utils.BASE_PATH + "/iot");
        deviceInfoDF.write().mode(SaveMode.Overwrite).orc(Utils.BASE_PATH + "/iot2");
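        //Note: orc() above, like json(), parquet(), csv() and text(), is shorthand for
        //format(...).save(...). An illustrative example with a hypothetical output path:
        deviceInfoDF.write().mode(SaveMode.Overwrite).json(Utils.BASE_PATH + "/iot_json");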
        //Pass parameters via option() to change how the data source reads and writes
        spark.read().option("mergeSchema", "true").parquet(Utils.BASE_PATH + "/trackerSession");
        deviceInfoDF.write().mode(SaveMode.Overwrite).option("compression", "snappy").parquet(Utils.BASE_PATH + "/iot3");
        Map<String, String> optMap = new HashMap<>();
        optMap.put("mergeSchema", "true");
        spark.read().options(optMap).parquet(Utils.BASE_PATH + "/trackerSession");
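        //Options are format-specific. An illustrative CSV example (commented out,
        //"some.csv" is a hypothetical file): "header" treats the first line as column
        //names, and "inferSchema" derives the column types from the data
        //spark.read().option("header", "true").option("inferSchema", "true").csv(Utils.BASE_PATH + "/some.csv");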
        //SaveMode
        //SaveMode.ErrorIfExists (string "error"): throw an exception if data already exists in the target directory (this is the default)
        //SaveMode.Append (string "append"): if data already exists in the target directory, append the new data to it
        //SaveMode.Overwrite (string "overwrite"): if data already exists in the target directory, overwrite it with the data being saved
        //SaveMode.Ignore (string "ignore"): if data already exists in the target directory, do nothing
        deviceInfoDF.write().option("compression", "snappy")
                .mode(SaveMode.Overwrite).parquet(Utils.BASE_PATH + "/iot3");
        deviceInfoDF.write().option("compression", "snappy")
                .mode("overwrite").parquet(Utils.BASE_PATH + "/iot3");
        spark.stop();
    }
}