Learn Spark Data Read and Write Code Development in One Article

Preface

This article belongs to the column 《大数据技术体系》, which is the author's original work. Please credit the source when quoting, and kindly point out any shortcomings or mistakes in the comments. Thank you!

For the column's table of contents and references, see 大数据技术体系.


Source Code Download

The spark-examples code is open source. The project aims to provide the most hands-on learning guide to Apache Spark code development.

Click the link to go to GitHub and download the source code: spark-examples


Main Content

Collection (read)

package com.shockang.study.spark.core.read

import org.apache.spark.{SparkConf, SparkContext}

object ReadCollectionRDDExample {
  def main(args: Array[String]): Unit = {
    // Initialize the SparkConf object and set basic job parameters
    val conf: SparkConf = new SparkConf()
      // Master address for job submission: "local" runs locally, and [*] allocates worker threads automatically
      .setMaster("local[*]")
      // Job name
      .setAppName("ReadCollectionRDDExample")

    // Create the SparkContext, passing in the SparkConf instance to customize the Spark runtime configuration
    val sc = new SparkContext(conf)

    val numbers = 1 to 100
    val rdd = sc.parallelize(numbers)

    // reduce folds the elements pairwise: 1+2=3, 3+3=6, 6+4=10, ...
    val sum = rdd.reduce(_ + _)

    println("1 + 2 + ... + 99 + 100 = " + sum)

    // Stop the SparkContext to end the job
    sc.stop()
  }

}
package com.shockang.study.spark.sql.read

import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}

/**
 * Create DataFrames from in-memory collections
 *
 * @author Shockang
 */
object ReadCollectionDataFrameExample {
  def main(args: Array[String]): Unit = {

    // Use the SparkSession API introduced in Spark 2.0 as the application entry point
    val spark = SparkSession.builder().master("local[*]").appName("ReadCollectionDataFrameExample").getOrCreate()

    val df1 = spark.createDataFrame(List(
      ("Alice", "Female", "20"),
      ("Tom", "Male", "25"),
      ("Boris", "Male", "18"))).toDF("name", "sex", "age")
    df1.show()

    val schema = StructType(List(
      StructField("name", StringType, nullable = true),
      StructField("age", IntegerType, nullable = true),
      StructField("sex", StringType, nullable = true)
    ))

    val javaList = new java.util.ArrayList[Row]()
    javaList.add(Row("Alice", 20, "Female"))
    javaList.add(Row("Tom", 18, "Male"))
    javaList.add(Row("Boris", 30, "Male"))
    val df2 = spark.createDataFrame(javaList, schema)
    df2.show()

    // Stop the SparkSession to end the job
    spark.stop()
  }
}
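Besides building Rows against an explicit schema, a DataFrame (or a typed Dataset) can also be created directly from a collection of case class instances. The snippet below is a minimal sketch rather than part of the spark-examples project; it assumes the same `spark` session as above, and the case class name `PersonRecord` is hypothetical. As in ReadObjectRDDExample, the case class should be defined at the object level (outside `main`) so Spark can derive an Encoder for it.

// Hypothetical case class, defined at the object level (outside main)
case class PersonRecord(name: String, sex: String, age: Int)

// Inside main, assuming the same `spark` session as above
import spark.implicits._

// Column names and types are taken from the case class fields
val df3 = spark.createDataFrame(Seq(
  PersonRecord("Alice", "Female", 20),
  PersonRecord("Tom", "Male", 25)
))
df3.printSchema()
df3.show()

// toDS() keeps the static type and yields a Dataset[PersonRecord]
val ds = Seq(PersonRecord("Boris", "Male", 18)).toDS()
ds.show()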

CSV (read)

package com.shockang.study.spark.core.read

import com.shockang.study.spark.READ_DATA_DIR
import org.apache.spark.{SparkConf, SparkContext}

object ReadCsvRDDExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("ReadCsvRDDExample")
    val sc = new SparkContext(conf)
    sc.setLogLevel("ERROR")

    // Read the CSV file and split each line on commas
    val inputCSVFile = sc.textFile(READ_DATA_DIR + "people.csv").flatMap(_.split(",")).collect
    inputCSVFile.foreach(println)
    // Read the TSV file and split each line on tabs
    val inputTSVFile = sc.textFile(READ_DATA_DIR + "people.tsv").flatMap(_.split("\t")).collect
    inputTSVFile.foreach(println)

    // Stop the SparkContext to end the job
    sc.stop()
  }
}

package com.shockang.study.spark.sql.read

import com.shockang.study.spark._
import org.apache.spark.sql.SparkSession

/**
 * Read csv/tsv files
 *
 * @author Shockang
 */
object ReadCsvDataFrameExample {
  def main(args: Array[String]): Unit = {

    // Use the SparkSession API introduced in Spark 2.0 as the application entry point
    val spark = SparkSession.builder().master("local[*]").appName("ReadCsvDataFrameExample").getOrCreate()
    // Read the csv file
    val inputCsvFile = spark.read.csv(READ_DATA_DIR + "people.csv")
    // Log the result
    printArray(inputCsvFile.collect)

    // Read the tsv file (Spark treats the delimiter string "\\t" as a tab character)
    val inputTsvFile = spark.read.option("delimiter", "\\t").csv(READ_DATA_DIR + "people.tsv")
    // Log the result
    printArray(inputTsvFile.collect)

    // Stop the SparkSession to end the job
    spark.stop()
  }
}
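By default, spark.read.csv treats every column as a string and assigns generic names like _c0. The following is a minimal sketch of the commonly used reader options, assuming the same `spark` session and READ_DATA_DIR constant as above; whether people.csv actually contains a header row is an assumption here.

// Treat the first line as column names and let Spark sample the file to infer column types
val csvWithOptions = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv(READ_DATA_DIR + "people.csv")
csvWithOptions.printSchema()
csvWithOptions.show()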

Hadoop (read)

package com.shockang.study.spark.core.read

import com.shockang.study.spark._
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object ReadHadoopRDDExample {
  def main(args: Array[String]): Unit = {
    // Initialize the SparkConf object and set basic job parameters
    val conf: SparkConf = new SparkConf()
      // Master address for job submission: "local" runs locally, and [*] allocates worker threads automatically
      .setMaster("local[*]")
      // Job name
      .setAppName("ReadHadoopRDDExample")
    // Instantiate the SparkContext
    val sc: SparkContext = new SparkContext(conf)

    val path: String = HDFS_READ_DIR + "people.txt"

    // Read using the old Hadoop API (org.apache.hadoop.mapred)
    val oldHadoopRDD: RDD[(LongWritable, Text)] = sc.hadoopFile[LongWritable, Text, mapred.TextInputFormat](path)
    printArray(oldHadoopRDD.map(_._2.toString).collect)

    // Read using the new Hadoop API (org.apache.hadoop.mapreduce)
    val newHadoopRDD: RDD[(LongWritable, Text)] = sc.newAPIHadoopFile[LongWritable, Text, TextInputFormat](path)
    printArray(newHadoopRDD.map(_._2.toString).collect)

    sc.stop()
  }
}

JSON (read)

package com.shockang.study.spark.core.read

import com.shockang.study.spark.READ_DATA_DIR
import org.apache.spark.{SparkConf, SparkContext}

import scala.util.parsing.json.JSON

object ReadJsonRDDExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("ReadJsonRDDExample")
    val sc = new SparkContext(conf)

    val inputJsonFile = sc.textFile(READ_DATA_DIR + "people.json")

    // JSON.parseFull returns Option[Any]: Some(...) for valid JSON, None otherwise
    val content = inputJsonFile.map(JSON.parseFull)
    println(content.collect.mkString("\t"))

    content.foreach(
      {
        case Some(map: Map[String, Any]) => println(map)
        case None => println("Invalid JSON")
        case _ => println("Other case")
      }
    )

    sc.stop()
  }
}

package com.shockang.study.spark.sql.read

import com.shockang.study.spark._
import org.apache.spark.sql.SparkSession

/**
 * Read json files
 *
 * @author Shockang
 */
object ReadJsonDataFrameExample {

  def main(args: Array[String]): Unit = {

    // Use the SparkSession API introduced in Spark 2.0 as the application entry point
    val spark = SparkSession.builder().master("local[*]").appName("ReadJsonDataFrameExample").getOrCreate()

    // Read the JSON file
    val inputJsonFile = spark.read.json(READ_DATA_DIR + "employees.json")

    printArray(inputJsonFile.collect())
    // Stop the SparkSession to end the job
    spark.stop()
  }
}
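spark.read.json expects one JSON object per line (JSON Lines). For a file that stores a pretty-printed JSON array or object spanning several lines, the multiLine option can be enabled. A minimal sketch, assuming the same `spark` session and constants as above; the file name people_array.json is hypothetical.

// Parse one multi-line JSON document per file instead of one object per line
val multiLineDF = spark.read
  .option("multiLine", "true")
  .json(READ_DATA_DIR + "people_array.json")
multiLineDF.show()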

MySQL (read)

package com.shockang.study.spark.core.read

import com.shockang.study.spark.{MYSQL_JDBC_URL, MYSQL_PASS, MYSQL_USER}
import org.apache.spark.rdd.JdbcRDD
import org.apache.spark.{SparkConf, SparkContext}

import java.sql.DriverManager

object ReadMysqlRDDExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("ReadMysqlRDDExample")
    val sc = new SparkContext(conf)

    // JdbcRDD arguments: connection factory, parameterized query with two bind variables,
    // lower bound, upper bound, number of partitions, and a row-mapping function
    val inputMysql = new JdbcRDD(sc,
      () => {
        Class.forName("com.mysql.jdbc.Driver")
        DriverManager.getConnection(MYSQL_JDBC_URL, MYSQL_USER, MYSQL_PASS)
      },
      "SELECT * FROM person WHERE id >= ? and id <= ?;",
      1,
      3,
      1,
      r => (r.getInt(1), r.getString(2), r.getInt(3)))

    println("Number of records returned: " + inputMysql.count)
    inputMysql.foreach(println)

    sc.stop()
  }
}

package com.shockang.study.spark.sql.read

import com.shockang.study.spark.{MYSQL_DB_TABLE, MYSQL_JDBC_URL, MYSQL_PASS, MYSQL_USER}
import org.apache.spark.sql.SparkSession

import java.util.Properties

/**
 * Read from MySQL through the DataFrame JDBC API
 *
 * @author Shockang
 */
object ReadMysqlDataFrameExample {
  def main(args: Array[String]): Unit = {
    // Use the SparkSession API introduced in Spark 2.0 as the application entry point
    val spark = SparkSession.builder().master("local[*]").appName("ReadMysqlDataFrameExample").getOrCreate()

    val jdbcDF = spark.read
      .format("jdbc")
      .option("url", MYSQL_JDBC_URL)
      .option("dbtable", MYSQL_DB_TABLE)
      .option("user", MYSQL_USER)
      .option("password", MYSQL_PASS)
      .load()
    jdbcDF.show()

    val connectionProperties = new Properties()
    connectionProperties.put("user", MYSQL_USER)
    connectionProperties.put("password", MYSQL_PASS)
    val jdbcDF2 = spark.read
      .jdbc(MYSQL_JDBC_URL, MYSQL_DB_TABLE, connectionProperties)
    jdbcDF2.show()

    // customSchema overrides the column types Spark infers from the JDBC metadata
    connectionProperties.put("customSchema", "id DECIMAL(38, 0), name STRING")
    val jdbcDF3 = spark.read
      .jdbc(MYSQL_JDBC_URL, MYSQL_DB_TABLE, connectionProperties)
    jdbcDF3.show()

    // Stop the SparkSession to end the job
    spark.stop()
  }
}
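The DataFrame reads above run as a single JDBC query. The read can be parallelized by giving Spark a numeric partition column and bounds, roughly what the JdbcRDD example does by hand. A minimal sketch, assuming the same `spark` session and MySQL constants as above; the column name "id" and the bounds are assumptions about the table layout.

// Split the read into 4 parallel JDBC queries over ranges of the id column
val partitionedJdbcDF = spark.read
  .format("jdbc")
  .option("url", MYSQL_JDBC_URL)
  .option("dbtable", MYSQL_DB_TABLE)
  .option("user", MYSQL_USER)
  .option("password", MYSQL_PASS)
  .option("partitionColumn", "id")
  .option("lowerBound", "1")
  .option("upperBound", "1000")
  .option("numPartitions", "4")
  .load()
println("Partitions: " + partitionedJdbcDF.rdd.getNumPartitions)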

Object file (read)

package com.shockang.study.spark.core.read

import com.shockang.study.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Read an object file
 *
 * @author Shockang
 */
object ReadObjectRDDExample {

  case class Person(name: String, age: Int)

  def main(args: Array[String]): Unit = {

    // Initialize the SparkConf object and set basic job parameters
    val conf: SparkConf = new SparkConf()
      // Master address for job submission: "local" runs locally, and [*] allocates worker threads automatically
      .setMaster("local[*]")
      // Job name
      .setAppName("ReadObjectRDDExample")

    // Instantiate the SparkContext
    val sc: SparkContext = new SparkContext(conf)

    val filePath: String = READ_DATA_DIR + "people.object"
    val rdd: RDD[Person] = sc.objectFile[Person](filePath)

    printArray(rdd.collect)

    sc.stop()
  }
}

SequenceFile (read)

package com.shockang.study.spark.core.read

import com.shockang.study.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Read a SequenceFile
 *
 * @author Shockang
 */
object ReadSequenceRDDExample {
  def main(args: Array[String]): Unit = {

    // Initialize the SparkConf object and set basic job parameters
    val conf: SparkConf = new SparkConf()
      // Master address for job submission: "local" runs locally, and [*] allocates worker threads automatically
      .setMaster("local[*]")
      // Job name
      .setAppName("ReadSequenceRDDExample")
    // Instantiate the SparkContext
    val sc: SparkContext = new SparkContext(conf)
    val path: String = READ_DATA_DIR + "people.sequence"

    // sequenceFile expects Hadoop Writable implementations such as IntWritable and Text as key/value types.
    // Spark also accepts some native types; for example, sequenceFile[Int, String] is converted to IntWritable and Text automatically.
    val rdd: RDD[(String, String)] = sc.sequenceFile[String, String](path)
    printArray(rdd.collect)
    sc.stop()
  }
}
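When the native-type shortcut is not wanted, sequenceFile can also be called with explicit Writable classes. A minimal sketch, assuming the same `sc` and printArray helper as above; the file name counts.sequence and its Text/IntWritable layout are assumptions.

import org.apache.hadoop.io.{IntWritable, Text}

// Read with explicit Writable key/value classes
val writableRDD = sc.sequenceFile(READ_DATA_DIR + "counts.sequence", classOf[Text], classOf[IntWritable])
// Hadoop reuses Writable instances, so convert them before collecting or caching
val converted = writableRDD.map { case (k, v) => (k.toString, v.get) }
printArray(converted.collect)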

TXT (read)

package com.shockang.study.spark.core.read

import com.shockang.study.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Read txt files
 *
 * @author Shockang
 */
object ReadTxtRDDExample {
  def main(args: Array[String]): Unit = {

    // Initialize the SparkConf object and set basic job parameters
    val conf: SparkConf = new SparkConf()
      // Master address for job submission: "local" runs locally, and [*] allocates worker threads automatically
      .setMaster("local[*]")
      // Job name
      .setAppName("ReadTxtRDDExample")
    // Instantiate the SparkContext
    val sc: SparkContext = new SparkContext(conf)
    sc.setLogLevel("ERROR")

    // textFile supports plain files, directories, compressed files, and wildcards
    val txtRDD: RDD[String] = sc.textFile(READ_DATA_DIR + "people.txt")
    printArray(txtRDD.collect)

    val dirRDD: RDD[String] = sc.textFile(READ_DATA_DIR)
    printArray(dirRDD.collect)

    // gzip-compressed files are decompressed automatically
    val gzipRDD: RDD[String] = sc.textFile(READ_DATA_DIR + "people.gz")
    printArray(gzipRDD.collect)

    val wildcardsRDD: RDD[String] = sc.textFile(READ_DATA_DIR + "*.txt")
    printArray(wildcardsRDD.collect)

    // A minimum number of partitions can be specified
    val txtRDDWithMinPartitions: RDD[String] = sc.textFile(READ_DATA_DIR + "people.txt", 1)
    printArray(txtRDDWithMinPartitions.collect)

    // wholeTextFiles reads every text file under the directory
    val wholeTxtRDD: RDD[(String, String)] = sc.wholeTextFiles(READ_DATA_DIR)
    // Log the result: returns (file name, file content) pairs
    printArray(wholeTxtRDD.collect)

    // wholeTextFiles also accepts a minimum number of partitions
    val wholeTxtRDDWithMinPartitions: RDD[(String, String)] = sc.wholeTextFiles(READ_DATA_DIR, 1)
    // Log the result: returns (file name, file content) pairs
    printArray(wholeTxtRDDWithMinPartitions.collect)

    // Stop the SparkContext to end the job
    sc.stop()
  }
}

package com.shockang.study.spark.sql.read

import com.shockang.study.spark.{READ_DATA_DIR, printArray}
import org.apache.spark.sql.SparkSession

/**
 * Read txt files
 *
 * @author Shockang
 */
object ReadTxtDataFrameExample {
  def main(args: Array[String]): Unit = {

    // Use the SparkSession API introduced in Spark 2.0 as the application entry point
    val spark = SparkSession.builder().master("local[*]").appName("ReadTxtDataFrameExample").getOrCreate()

    // Read the txt file
    val inputTextFile = spark.read.textFile(READ_DATA_DIR + "people.txt")

    // Log the result
    printArray(inputTextFile.collect)

    // Read all txt files in the directory via a wildcard
    val allTextFile = spark.read.textFile(READ_DATA_DIR + "*.txt")

    // Log the result
    printArray(allTextFile.collect)

    // Stop the SparkSession to end the job
    spark.stop()
  }
}
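Note that spark.read.textFile (used above) returns a Dataset[String], while the closely related spark.read.text returns a DataFrame with a single column named "value". A minimal sketch, assuming the same `spark` session and constants as above.

// text() yields a DataFrame with one string column named "value"
val textDF = spark.read.text(READ_DATA_DIR + "people.txt")
textDF.printSchema()
textDF.show()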

CSV (write)

package com.shockang.study.spark.core.write

import com.shockang.study.spark.WRITE_DATA_DIR
import com.shockang.study.spark.util.Utils.writableLocalFsPath
import org.apache.spark.{SparkConf, SparkContext}

object WriteCsvRDDExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("WriteCsvRDDExample")
    val sc = new SparkContext(conf)

    val array = Array("Thomas", 18, "male", "65kg", "180cm")

    // Save as a comma-separated (CSV) text file
    val csvRDD = sc.parallelize(Array(array.mkString(",")), 1)
    val csvPath = WRITE_DATA_DIR + "WriteCsvRDDExample1"
    csvRDD.saveAsTextFile(writableLocalFsPath(csvPath))

    // Save as a tab-separated (TSV) text file
    val tsvRDD = sc.parallelize(Array(array.mkString("\t")), 1)
    val tsvPath = WRITE_DATA_DIR + "WriteCsvRDDExample2"
    tsvRDD.saveAsTextFile(writableLocalFsPath(tsvPath))

    sc.stop()
  }
}

package com.shockang.study.spark.sql.write

import com.shockang.study.spark.WRITE_DATA_DIR
import com.shockang.study.spark.util.Utils.writableLocalFsPath
import org.apache.spark.sql.SparkSession

/**
 * Write data to csv/tsv files
 *
 * @author Shockang
 */
object WriteCsvDataFrameExample {
  def main(args: Array[String]): Unit = {

    // Use the SparkSession API introduced in Spark 2.0 as the application entry point
    val spark = SparkSession.builder().master("local[*]").appName("WriteCsvDataFrameExample").getOrCreate()

    // Save as CSV
    val df = spark.createDataFrame(List(("one", 1), ("two", 2), ("three", 3)))
    val csvPath = WRITE_DATA_DIR + "WriteCsvDataFrameExample1"
    df.write.csv(writableLocalFsPath(csvPath))

    // Save as TSV (Spark treats the delimiter string "\\t" as a tab character)
    val tsvPath = WRITE_DATA_DIR + "WriteCsvDataFrameExample2"
    df.write.option("delimiter", "\\t").csv(writableLocalFsPath(tsvPath))

    spark.stop()
  }
}
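The CSV writer also accepts options such as a header line, a custom separator, and a save mode; re-running the plain examples above fails if the output directory already exists. A minimal sketch, assuming the same `spark` session, df, and helpers as above; the output name WriteCsvDataFrameExample3 and the ";" separator are arbitrary choices.

// Merge into a single output file (fine for small data), overwrite on re-runs,
// write a header line, and use ";" as the field separator
val csvWithHeaderPath = WRITE_DATA_DIR + "WriteCsvDataFrameExample3"
df.coalesce(1)
  .write
  .mode("overwrite")
  .option("header", "true")
  .option("sep", ";")
  .csv(writableLocalFsPath(csvWithHeaderPath))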

Hadoop (write)

package com.shockang.study.spark.core.write


import com.shockang.study.spark._
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object WriteHadoopRDDExample {
  def main(args: Array[String]): Unit = {
    // Initialize the SparkConf object and set basic job parameters
    val conf: SparkConf = new SparkConf()
      // Master address for job submission: "local" runs locally, and [*] allocates worker threads automatically
      .setMaster("local[*]")
      // Job name
      .setAppName("WriteHadoopRDDExample")
    // Instantiate the SparkContext
    val sc: SparkContext = new SparkContext(conf)

    val rdd: RDD[(String, String)] = sc.parallelize(Seq(("Michael", "29"), ("Andy", "30"), ("Justin", "19")), 1).cache()

    // Write using the old Hadoop API (org.apache.hadoop.mapred); the keys and values are Strings,
    // so Text is used as both output classes
    val oldPath: String = HDFS_WRITE_DIR + "WriteHadoopRDDExampleWithOldAPI"
    rdd.saveAsHadoopFile(oldPath, classOf[Text], classOf[Text], classOf[org.apache.hadoop.mapred.TextOutputFormat[Text, Text]])

    // Write using the new Hadoop API (org.apache.hadoop.mapreduce)
    val newPath: String = HDFS_WRITE_DIR + "WriteHadoopRDDExampleWithNewAPI"
    rdd.saveAsNewAPIHadoopFile(newPath, classOf[Text], classOf[Text], classOf[TextOutputFormat[Text, Text]])

    sc.stop()
  }
}

JSON (write)

package com.shockang.study.spark.core.write

import com.shockang.study.spark.WRITE_DATA_DIR
import com.shockang.study.spark.util.Utils.writableLocalFsPath
import org.apache.spark.{SparkConf, SparkContext}

import scala.util.parsing.json.{JSONArray, JSONObject}

object WriteJsonRDDExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("WriteJsonRDDExample")
    val sc = new SparkContext(conf)

    // Build JSON objects with the scala.util.parsing.json helpers
    val map1 = Map("name" -> "Thomas", "age" -> "20", "address" -> JSONArray(List("Address 1", "Address 2")))
    val map2 = Map("name" -> "Alice", "age" -> "18", "address" -> JSONArray(List("Address 1", "Address 2", "Address 3")))

    val rddData = sc.parallelize(List(JSONObject(map1), JSONObject(map2)), 1)
    val jsonPath = WRITE_DATA_DIR + "WriteJsonRDDExample"
    rddData.saveAsTextFile(writableLocalFsPath(jsonPath))

    sc.stop()
  }
}

package com.shockang.study.spark.sql.write

import com.shockang.study.spark._
import com.shockang.study.spark.util.Utils.writableLocalFsPath
import org.apache.spark.sql.SparkSession

/**
 * Write data to a json file
 *
 * @author Shockang
 */
object WriteJsonDataFrameExample {

  def main(args: Array[String]): Unit = {

    // Use the SparkSession API introduced in Spark 2.0 as the application entry point
    val spark = SparkSession.builder().master("local[*]").appName("WriteJsonDataFrameExample").getOrCreate()

    // Save as JSON, one object per line
    val df = spark.createDataFrame(List(("one", 1), ("two", 2), ("three", 3)))
    val jsonPath = WRITE_DATA_DIR + "WriteJsonDataFrameExample"
    df.write.json(writableLocalFsPath(jsonPath))

    spark.stop()
  }
}
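The JSON writer can also compress its output. A minimal sketch, assuming the same `spark` session, df, and helpers as above; the output name is arbitrary.

// Write gzip-compressed JSON Lines; other supported codecs include none, bzip2, snappy, lz4 and deflate
val gzJsonPath = WRITE_DATA_DIR + "WriteJsonDataFrameExample2"
df.write
  .mode("overwrite")
  .option("compression", "gzip")
  .json(writableLocalFsPath(gzJsonPath))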

MySQL (write)

package com.shockang.study.spark.core.write

import com.shockang.study.spark.{MYSQL_JDBC_URL, MYSQL_PASS, MYSQL_USER}
import org.apache.spark.{SparkConf, SparkContext}

import java.sql.DriverManager

object WriteMysqlRDDExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("WriteMysqlRDDExample")
    val sc = new SparkContext(conf)

    Class.forName("com.mysql.jdbc.Driver")

    val rddData = sc.parallelize(List(("Alice", 30), ("Kotlin", 37)))
    // Open one connection per partition and insert that partition's rows in a single JDBC batch
    rddData.foreachPartition((iter: Iterator[(String, Int)]) => {
      val conn = DriverManager.getConnection(MYSQL_JDBC_URL, MYSQL_USER, MYSQL_PASS)
      conn.setAutoCommit(false)
      val preparedStatement = conn.prepareStatement("INSERT INTO spark_examples.person (`name`, `age`) VALUES (?, ?);")
      iter.foreach(t => {
        preparedStatement.setString(1, t._1)
        preparedStatement.setInt(2, t._2)
        preparedStatement.addBatch()
      })
      preparedStatement.executeBatch()
      conn.commit()
      conn.close()
    })
    sc.stop()
  }
}

package com.shockang.study.spark.sql.write

import com.shockang.study.spark.{MYSQL_DB_TABLE, MYSQL_JDBC_URL, MYSQL_PASS, MYSQL_USER}
import org.apache.spark.sql.SparkSession

import java.util.Properties

/**
 * Write to MySQL through the DataFrame JDBC API
 *
 * @author Shockang
 */
object WriteMysqlDataFrameExample {
  def main(args: Array[String]): Unit = {
    // Use the SparkSession API introduced in Spark 2.0 as the application entry point
    val spark = SparkSession.builder().master("local[*]").appName("WriteMysqlDataFrameExample").getOrCreate()

    val df = spark.createDataFrame(List(
      ("Alice", "Female", "20"),
      ("Tom", "Male", "25"),
      ("Boris", "Male", "18"))).toDF("name", "sex", "age")

    // Saving data to a JDBC source; overwrite so the example can be re-run
    df.write
      .format("jdbc")
      .option("url", MYSQL_JDBC_URL)
      .option("dbtable", MYSQL_DB_TABLE)
      .option("user", MYSQL_USER)
      .option("password", MYSQL_PASS)
      .mode("overwrite")
      .save()

    val connectionProperties = new Properties()
    connectionProperties.put("user", MYSQL_USER)
    connectionProperties.put("password", MYSQL_PASS)

    // Equivalent write through the jdbc() shortcut
    df.write
      .mode("overwrite")
      .jdbc(MYSQL_JDBC_URL, MYSQL_DB_TABLE, connectionProperties)

    // createTableColumnTypes customizes the column types used when Spark creates the table;
    // it may only reference columns that exist in the DataFrame
    df.write
      .mode("overwrite")
      .option("createTableColumnTypes", "name VARCHAR(64), sex VARCHAR(16)")
      .jdbc(MYSQL_JDBC_URL, MYSQL_DB_TABLE, connectionProperties)

    spark.stop()
  }
}
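When rows should be added to an existing table instead of recreating it, use append mode; batching and transaction isolation can be tuned through writer options. A minimal sketch, assuming the same `spark` session, df, connectionProperties, and MySQL constants as above.

// Append to the existing table; send 1000 rows per JDBC batch and use READ_COMMITTED transactions
df.write
  .mode("append")
  .option("batchsize", "1000")
  .option("isolationLevel", "READ_COMMITTED")
  .jdbc(MYSQL_JDBC_URL, MYSQL_DB_TABLE, connectionProperties)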

Object file (write)

package com.shockang.study.spark.core.write

import com.shockang.study.spark._
import com.shockang.study.spark.core.read.ReadObjectRDDExample.Person
import com.shockang.study.spark.util.Utils.writableLocalFsPath
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 *
 * Write data to an object file
 *
 * @author Shockang
 */
object WriteObjectRDDExample {
  def main(args: Array[String]): Unit = {

    // Initialize the SparkConf object and set basic job parameters
    val conf: SparkConf = new SparkConf()
      // Master address for job submission: "local" runs locally, and [*] allocates worker threads automatically
      .setMaster("local[*]")
      // Job name
      .setAppName("WriteObjectRDDExample")
    // Instantiate the SparkContext
    val sc: SparkContext = new SparkContext(conf)

    val rdd: RDD[Person] = sc.parallelize(Seq(Person("小明", 20), Person("Alice", 18)), 1)
    val filePath: String = WRITE_DATA_DIR + "WriteObjectRDDExample"
    rdd.saveAsObjectFile(writableLocalFsPath(filePath))

    sc.stop()
  }
}

SequenceFile (write)

package com.shockang.study.spark.core.write

import com.shockang.study.spark._
import com.shockang.study.spark.util.Utils.writableLocalFsPath
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 *
 * Write data to a SequenceFile
 *
 * @author Shockang
 */
object WriteSequenceRDDExample {
  def main(args: Array[String]): Unit = {

    // Initialize the SparkConf object and set basic job parameters
    val conf: SparkConf = new SparkConf()
      // Master address for job submission: "local" runs locally, and [*] allocates worker threads automatically
      .setMaster("local[*]")
      // Job name
      .setAppName("WriteSequenceRDDExample")
    // Instantiate the SparkContext
    val sc: SparkContext = new SparkContext(conf)
    val rdd: RDD[(String, String)] = sc.parallelize(Seq(("姓名", "小明"), ("年龄", "18")), 1)
    val path: String = WRITE_DATA_DIR + "WriteSequenceExample"
    rdd.saveAsSequenceFile(writableLocalFsPath(path))

    sc.stop()
  }
}

TXT (write)

package com.shockang.study.spark.core.write

import com.shockang.study.spark._
import com.shockang.study.spark.util.Utils.writableLocalFsPath
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Write data to a txt file
 *
 * @author Shockang
 */
object WriteTxtRDDExample {
  def main(args: Array[String]): Unit = {

    // Initialize the SparkConf object and set basic job parameters
    val conf: SparkConf = new SparkConf()
      // Master address for job submission: "local" runs locally, and [*] allocates worker threads automatically
      .setMaster("local[*]")
      // Job name
      .setAppName("WriteTxtRDDExample")
    // Instantiate the SparkContext
    val sc: SparkContext = new SparkContext(conf)

    val rdd: RDD[(String, Int)] = sc.parallelize(Array(("one", 1), ("two", 2), ("three", 3)), 1)
    val filePath: String = WRITE_DATA_DIR + "WriteTxtRDDExample"
    rdd.saveAsTextFile(writableLocalFsPath(filePath))

    sc.stop()
  }
}

package com.shockang.study.spark.sql.write

import com.shockang.study.spark.WRITE_DATA_DIR
import com.shockang.study.spark.util.Utils.writableLocalFsPath
import org.apache.spark.sql.SparkSession

/**
 * Write data to a txt file
 *
 * @author Shockang
 */
object WriteTxtDataFrameExample {

  case class Person(name: String, sex: String, age: Int)

  def main(args: Array[String]): Unit = {

    // Use the SparkSession API introduced in Spark 2.0 as the application entry point
    val spark = SparkSession.builder().master("local[*]").appName("WriteTxtDataFrameExample").getOrCreate()

    // Implicit import needed for the Encoder used by map below
    import spark.implicits._

    // The text writer only supports a single String column, so convert each row to a String first
    val df = spark.createDataFrame(List(("one", 1), ("two", 2), ("three", 3))).map(_.mkString("(", ", ", ")"))

    val filePath = WRITE_DATA_DIR + "WriteTxtDataFrameExample"

    df.write.text(writableLocalFsPath(filePath))

    spark.stop()
  }
}

Parquet (read and write)

package com.shockang.study.spark.sql.read

import com.shockang.study.spark._
import org.apache.spark.sql.SparkSession

/**
 * Read and write Parquet files
 *
 * @author Shockang
 */
object ParquetExample {
  def main(args: Array[String]): Unit = {
    // Use the SparkSession API introduced in Spark 2.0 as the application entry point
    val spark = SparkSession.builder().master("local[*]").appName("ParquetExample").getOrCreate()

    import spark.implicits._

    val peopleDF = spark.read.json(READ_DATA_DIR + "people.json")

    // DataFrames can be saved as Parquet files, preserving the schema
    peopleDF.write.parquet(WRITE_DATA_DIR + "people.parquet")

    // Read the Parquet file back; the schema is recovered from the file itself
    val parquetFileDF = spark.read.parquet(WRITE_DATA_DIR + "people.parquet")

    parquetFileDF.createOrReplaceTempView("parquetFile")
    val namesDF = spark.sql("SELECT name FROM parquetFile WHERE age BETWEEN 10 AND 20")
    namesDF.map(attributes => "Name: " + attributes(0)).show()

    spark.stop()
  }
}
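Parquet output can additionally be partitioned by one or more columns, which lays the data out as one sub-directory per distinct value and lets later reads prune partitions. A minimal sketch, assuming the same `spark` session and constants as above; the output name is arbitrary.

// One sub-directory per distinct age value, e.g. .../age=19/part-...
peopleDF.write
  .mode("overwrite")
  .partitionBy("age")
  .parquet(WRITE_DATA_DIR + "people_partitioned.parquet")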
