Preface
This article is part of my original column 《大数据技术体系》 (Big Data Technology System). Please credit the source when quoting it, and feel free to point out any shortcomings or mistakes in the comments. Thanks!
For the column's table of contents and references, see 大数据技术体系.
Source Code Download
The spark-examples code is open source. The project aims to provide the most practical, hands-on guide to Apache Spark development.
Download the source code from GitHub: spark-examples
Main Text
Reading
Collection
package com.shockang.study.spark.core.read
import org.apache.spark.{SparkConf, SparkContext}
object ReadCollectionRDDExample {
def main(args: Array[String]): Unit = {
// Initialize the SparkConf object and set the basic job parameters
val conf: SparkConf = new SparkConf()
// Set the master URL to submit to; "local" runs locally and [*] uses as many worker threads as there are cores
.setMaster("local[*]")
// Set the application name
.setAppName("ReadCollectionRDDExample")
// Create the SparkContext, passing in the SparkConf instance to customize how Spark runs
val sc = new SparkContext(conf)
val numbers = 1 to 100
val rdd = sc.parallelize(numbers)
// 1+2=3, 3+3=6, 6+4=10, ...
val sum = rdd.reduce(_ + _)
println("1+2+....+ 99+100 = " + sum)
// Stop the SparkContext and end the job
sc.stop()
}
}
package com.shockang.study.spark.sql.read
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}
/**
 * Create DataFrames from in-memory collections
 *
 * @author Shockang
 */
object ReadCollectionDataFrameExample {
def main(args: Array[String]): Unit = {
// Use the SparkSession API introduced in Spark 2.0 as the entry point of the application
val spark = SparkSession.builder().master("local[*]").appName("ReadCollectionDataFrameExample").getOrCreate()
val df1 = spark.createDataFrame(List(
("Alice", "Female", "20"),
("Tom", "Male", "25"),
("Boris", "Male", "18"))).toDF("name", "sex", "age")
df1.show()
val schema = StructType(List(
StructField("name", StringType, nullable = true),
StructField("age", IntegerType, nullable = true),
StructField("sex", StringType, nullable = true)
))
val javaList = new java.util.ArrayList[Row]()
javaList.add(Row("Alice", 20, "Female"))
javaList.add(Row("Tom", 18, "Male"))
javaList.add(Row("Boris", 30, "Male"))
val df2 = spark.createDataFrame(javaList, schema)
df2.show
// Stop the SparkSession and end the job
spark.stop()
}
}
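Besides createDataFrame, a collection of case class instances can be turned into a DataFrame through the implicit conversions on SparkSession; the column names and types are then taken from the case class. A minimal sketch under that approach (the ReadCaseClassDataFrameExample object and its Person case class are illustrative, not part of the repository):
package com.shockang.study.spark.sql.read

import org.apache.spark.sql.SparkSession

// Case class fields become the column names and types
case class Person(name: String, sex: String, age: Int)

object ReadCaseClassDataFrameExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("ReadCaseClassDataFrameExample").getOrCreate()
    // The implicits provide the Seq[Person] => DataFrame conversion used by toDF
    import spark.implicits._
    val df = Seq(Person("Alice", "Female", 20), Person("Tom", "Male", 25), Person("Boris", "Male", 18)).toDF()
    df.printSchema()
    df.show()
    spark.stop()
  }
}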
CSV
package com.shockang.study.spark.core.read
import com.shockang.study.spark.READ_DATA_DIR
import org.apache.spark.{SparkConf, SparkContext}
object ReadCsvRDDExample {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
.setMaster("local[*]")
.setAppName("ReadCsvRDDExample")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
// Read the CSV file
val inputCSVFile = sc.textFile(READ_DATA_DIR + "people.csv").flatMap(_.split(",")).collect
inputCSVFile.foreach(println)
// Read the TSV file
val inputTSVFile = sc.textFile(READ_DATA_DIR + "people.tsv").flatMap(_.split("\t")).collect
inputTSVFile.foreach(println)
// Stop the SparkContext and end the job
sc.stop()
}
}
package com.shockang.study.spark.sql.read
import com.shockang.study.spark._
import org.apache.spark.sql.SparkSession
/**
 * Read csv/tsv files
*
* @author Shockang
*/
object ReadCsvDataFrameExample {
def main(args: Array[String]): Unit = {
// Use the SparkSession API introduced in Spark 2.0 as the entry point of the application
val spark = SparkSession.builder().master("local[*]").appName("ReadCsvDataFrameExample").getOrCreate()
// Read the csv file
val inputCsvFile = spark.read.csv(READ_DATA_DIR + "people.csv")
// Print the results
printArray(inputCsvFile.collect)
// Read the tsv file, using tab as the delimiter
val inputTsvFile = spark.read.option("delimiter", "\\t").csv(READ_DATA_DIR + "people.tsv")
// Print the results
printArray(inputTsvFile.collect)
// Stop the SparkSession and end the job
spark.stop()
}
}
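spark.read.csv above treats every column as a string and names them _c0, _c1, and so on. If the CSV file carries a header row, the header and inferSchema options give named, typed columns instead. A minimal sketch under that assumption (the object name and the assumption that people.csv has a header row are illustrative):
package com.shockang.study.spark.sql.read

import com.shockang.study.spark._
import org.apache.spark.sql.SparkSession

object ReadCsvWithHeaderExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("ReadCsvWithHeaderExample").getOrCreate()
    val df = spark.read
      // use the first line as column names
      .option("header", "true")
      // make an extra pass over the data to guess column types
      .option("inferSchema", "true")
      .csv(READ_DATA_DIR + "people.csv")
    df.printSchema()
    printArray(df.collect)
    spark.stop()
  }
}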
Hadoop
package com.shockang.study.spark.core.read
import com.shockang.study.spark._
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object ReadHadoopRDDExample {
def main(args: Array[String]): Unit = {
// Initialize the SparkConf object and set the basic job parameters
val conf: SparkConf = new SparkConf()
// Set the master URL to submit to; "local" runs locally and [*] uses as many worker threads as there are cores
.setMaster("local[*]")
// Set the application name
.setAppName("ReadHadoopRDDExample")
// Instantiate the SparkContext
val sc: SparkContext = new SparkContext(conf)
val path: String = HDFS_READ_DIR + "people.txt"
// Read with the old Hadoop API (org.apache.hadoop.mapred)
val oldHadoopRDD: RDD[(LongWritable, Text)] = sc.hadoopFile[LongWritable, Text, mapred.TextInputFormat](path)
printArray(oldHadoopRDD.map(_._2.toString).collect)
// Read with the new Hadoop API (org.apache.hadoop.mapreduce)
val newHadoopRDD: RDD[(LongWritable, Text)] = sc.newAPIHadoopFile[LongWritable, Text, TextInputFormat](path)
printArray(newHadoopRDD.map(_._2.toString).collect)
sc.stop
}
}
JSON
package com.shockang.study.spark.core.read
import com.shockang.study.spark.READ_DATA_DIR
import org.apache.spark.{SparkConf, SparkContext}
import scala.util.parsing.json.JSON
object ReadJsonRDDExample {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
.setMaster("local[*]")
.setAppName("ReadJsonRDDExample")
val sc = new SparkContext(conf)
val inputJsonFile = sc.textFile(READ_DATA_DIR + "people.json")
// JSON.parseFull returns Option[Any]: Some(...) when parsing succeeds, None otherwise
val content = inputJsonFile.map(JSON.parseFull)
println(content.collect.mkString("\t"))
content.foreach {
case Some(map: Map[String, Any]) => println(map)
case None => println("Invalid JSON")
case _ => println("Unexpected parse result")
}
sc.stop()
}
}
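scala.util.parsing.json, used above, has been deprecated since Scala 2.11. A sketch of the same parsing done with json4s, which Spark already ships as a dependency (the object name and the pattern-matching output format are my own, not from the repository):
package com.shockang.study.spark.core.read

import com.shockang.study.spark.READ_DATA_DIR
import org.apache.spark.{SparkConf, SparkContext}
import org.json4s._
import org.json4s.jackson.JsonMethods._

object ReadJsonWithJson4sExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("ReadJsonWithJson4sExample")
    val sc = new SparkContext(conf)
    // Collect the lines first and parse on the driver, keeping the example free of serialization concerns
    val lines = sc.textFile(READ_DATA_DIR + "people.json").collect
    lines.foreach { line =>
      parse(line) match {
        // A JObject is a list of (field name, JValue) pairs
        case JObject(fields) => println(fields.toMap)
        case _ => println("Not a JSON object: " + line)
      }
    }
    sc.stop()
  }
}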
package com.shockang.study.spark.sql.read
import com.shockang.study.spark._
import org.apache.spark.sql.SparkSession
/**
 * Read a json file
*
* @author Shockang
*/
object ReadJsonDataFrameExample {
def main(args: Array[String]): Unit = {
// Use the SparkSession API introduced in Spark 2.0 as the entry point of the application
val spark = SparkSession.builder().master("local[*]").appName("ReadJsonDataFrameExample").getOrCreate()
// Read the json file (one JSON object per line is expected by default)
val inputJsonFile = spark.read.json(READ_DATA_DIR + "employees.json")
printArray(inputJsonFile.collect())
// Stop the SparkSession and end the job
spark.stop()
}
}
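spark.read.json expects JSON Lines by default, i.e. one complete JSON object per line. For a file that contains a single pretty-printed JSON document or a JSON array spanning multiple lines, the multiLine option (available since Spark 2.2) handles it. A minimal sketch; the employees_multiline.json file name is hypothetical:
package com.shockang.study.spark.sql.read

import com.shockang.study.spark._
import org.apache.spark.sql.SparkSession

object ReadMultiLineJsonExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("ReadMultiLineJsonExample").getOrCreate()
    // multiLine lets the reader parse one JSON document that spans several lines
    val df = spark.read.option("multiLine", "true").json(READ_DATA_DIR + "employees_multiline.json")
    printArray(df.collect)
    spark.stop()
  }
}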
MySQL
package com.shockang.study.spark.core.read
import com.shockang.study.spark.{MYSQL_JDBC_URL, MYSQL_PASS, MYSQL_USER}
import org.apache.spark.rdd.JdbcRDD
import org.apache.spark.{SparkConf, SparkContext}
import java.sql.DriverManager
object ReadMysqlRDDExample {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
.setMaster("local[*]")
.setAppName("ReadMysqlRDDExample")
val sc = new SparkContext(conf)
// JdbcRDD arguments: a connection factory, an SQL statement with two "?" placeholders for the key range,
// the lower bound, the upper bound, the number of partitions, and a mapping function for each ResultSet row
val inputMysql = new JdbcRDD(sc,
() => {
Class.forName("com.mysql.jdbc.Driver")
DriverManager.getConnection(MYSQL_JDBC_URL, MYSQL_USER, MYSQL_PASS)
},
"SELECT * FROM person WHERE id >= ? and id <= ?;",
1,
3,
1,
r => (r.getInt(1), r.getString(2), r.getInt(3)))
println("Number of records returned: " + inputMysql.count)
inputMysql.foreach(println)
sc.stop()
}
}
package com.shockang.study.spark.sql.read
import com.shockang.study.spark.{MYSQL_DB_TABLE, MYSQL_JDBC_URL, MYSQL_PASS, MYSQL_USER}
import org.apache.spark.sql.SparkSession
import java.util.Properties
/**
*
* @author Shockang
*/
object ReadMysqlDataFrameExample {
def main(args: Array[String]): Unit = {
// Use the SparkSession API introduced in Spark 2.0 as the entry point of the application
val spark = SparkSession.builder().master("local[*]").appName("ReadMysqlDataFrameExample").getOrCreate()
val jdbcDF = spark.read
.format("jdbc")
.option("url", MYSQL_JDBC_URL)
.option("dbtable", MYSQL_DB_TABLE)
.option("user", MYSQL_USER)
.option("password", MYSQL_PASS)
.load()
jdbcDF.show()
val connectionProperties = new Properties()
connectionProperties.put("user", MYSQL_USER)
connectionProperties.put("password", MYSQL_PASS)
val jdbcDF2 = spark.read
.jdbc(MYSQL_JDBC_URL, MYSQL_DB_TABLE, connectionProperties)
jdbcDF2.show()
connectionProperties.put("customSchema", "id DECIMAL(38, 0), name STRING")
val jdbcDF3 = spark.read
.jdbc(MYSQL_JDBC_URL, MYSQL_DB_TABLE, connectionProperties)
jdbcDF3.show()
}
}
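Both reads above pull the table through a single connection. For larger tables, the JDBC source can split the read across several partitions by declaring a numeric partition column and its bounds. A minimal sketch, assuming the table has a numeric id column (the bounds and partition count are illustrative):
package com.shockang.study.spark.sql.read

import com.shockang.study.spark.{MYSQL_DB_TABLE, MYSQL_JDBC_URL, MYSQL_PASS, MYSQL_USER}
import org.apache.spark.sql.SparkSession

object ReadMysqlPartitionedExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("ReadMysqlPartitionedExample").getOrCreate()
    val jdbcDF = spark.read
      .format("jdbc")
      .option("url", MYSQL_JDBC_URL)
      .option("dbtable", MYSQL_DB_TABLE)
      .option("user", MYSQL_USER)
      .option("password", MYSQL_PASS)
      // split the read into 4 partitions on the numeric column id, covering ids 1 to 1000
      .option("partitionColumn", "id")
      .option("lowerBound", "1")
      .option("upperBound", "1000")
      .option("numPartitions", "4")
      .load()
    println("Number of partitions: " + jdbcDF.rdd.getNumPartitions)
    jdbcDF.show()
    spark.stop()
  }
}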
Object Files
package com.shockang.study.spark.core.read
import com.shockang.study.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Read an object file
*
* @author Shockang
*/
object ReadObjectRDDExample {
case class Person(name: String, age: Int)
def main(args: Array[String]): Unit = {
// Initialize the SparkConf object and set the basic job parameters
val conf: SparkConf = new SparkConf()
// Set the master URL to submit to; "local" runs locally and [*] uses as many worker threads as there are cores
.setMaster("local[*]")
// Set the application name
.setAppName("ReadObjectRDDExample")
// Instantiate the SparkContext
val sc: SparkContext = new SparkContext(conf)
val filePath: String = READ_DATA_DIR + "people.object"
val rdd: RDD[Person] = sc.objectFile[Person](filePath)
printArray(rdd.collect)
sc.stop()
}
}
SequenceFile
package com.shockang.study.spark.core.read
import com.shockang.study.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Read a SequenceFile
*
* @author Shockang
*/
object ReadSequenceRDDExample {
def main(args: Array[String]): Unit = {
// Initialize the SparkConf object and set the basic job parameters
val conf: SparkConf = new SparkConf()
// Set the master URL to submit to; "local" runs locally and [*] uses as many worker threads as there are cores
.setMaster("local[*]")
// Set the application name
.setAppName("ReadSequenceRDDExample")
// Instantiate the SparkContext
val sc: SparkContext = new SparkContext(conf)
val path: String = READ_DATA_DIR + "people.sequence"
// sequenceFile expects Hadoop Writable implementations such as IntWritable and Text as key/value types.
// Spark also accepts some native types, e.g. sequenceFile[Int, String] is converted to IntWritable and Text automatically.
val rdd: RDD[(String, String)] = sc.sequenceFile[String, String](path)
printArray(rdd.collect)
sc.stop()
}
}
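The same file can also be read by passing the Writable classes explicitly. Because Hadoop's RecordReader reuses the same Writable objects for every record, the values are copied into plain Strings inside the map before anything is collected or cached. A minimal sketch (the object name is illustrative and assumes people.sequence was written with Text keys and values):
package com.shockang.study.spark.core.read

import com.shockang.study.spark._
import org.apache.hadoop.io.Text
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object ReadSequenceWithWritableExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("ReadSequenceWithWritableExample")
    val sc = new SparkContext(conf)
    val path = READ_DATA_DIR + "people.sequence"
    // Pass the Writable key/value classes explicitly, then copy each record into plain Strings
    val rdd: RDD[(String, String)] = sc.sequenceFile(path, classOf[Text], classOf[Text])
      .map { case (k, v) => (k.toString, v.toString) }
    printArray(rdd.collect)
    sc.stop()
  }
}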
TXT
package com.shockang.study.spark.core.read
import com.shockang.study.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Read txt files
*
* @author Shockang
*/
object ReadTxtRDDExample {
def main(args: Array[String]): Unit = {
// Initialize the SparkConf object and set the basic job parameters
val conf: SparkConf = new SparkConf()
// Set the master URL to submit to; "local" runs locally and [*] uses as many worker threads as there are cores
.setMaster("local[*]")
// Set the application name
.setAppName("ReadTxtRDDExample")
// Instantiate the SparkContext
val sc: SparkContext = new SparkContext(conf)
sc.setLogLevel("ERROR")
// Read a text file; directories, compressed files and wildcards are also supported
val txtRDD: RDD[String] = sc.textFile(READ_DATA_DIR + "people.txt")
printArray(txtRDD.collect)
val dirRDD: RDD[String] = sc.textFile(READ_DATA_DIR)
printArray(dirRDD.collect)
// gzip compression is supported out of the box
val gzipRDD: RDD[String] = sc.textFile(READ_DATA_DIR + "people.gz")
printArray(gzipRDD.collect)
val wildcardsRDD: RDD[String] = sc.textFile(READ_DATA_DIR + "*.txt")
printArray(wildcardsRDD.collect)
// A minimum number of partitions can be specified
val txtRDDWithMinPartitions: RDD[String] = sc.textFile(READ_DATA_DIR + "people.txt", 1)
printArray(txtRDDWithMinPartitions.collect)
// Read all text files in a directory
val wholeTxtRDD: RDD[(String, String)] = sc.wholeTextFiles(READ_DATA_DIR)
// Print the results; each element is a (file name, file content) pair
printArray(wholeTxtRDD.collect)
// Read all text files in a directory, again with a custom minimum number of partitions
val wholeTxtRDDWithMinPartitions: RDD[(String, String)] = sc.wholeTextFiles(READ_DATA_DIR, 1)
// Print the results; each element is a (file name, file content) pair
printArray(wholeTxtRDDWithMinPartitions.collect)
// Stop the SparkContext and end the job
sc.stop()
}
}
package com.shockang.study.spark.sql.read
import com.shockang.study.spark.{READ_DATA_DIR, printArray}
import org.apache.spark.sql.SparkSession
/**
 * Read txt files
*
* @author Shockang
*/
object ReadTxtDataFrameExample {
def main(args: Array[String]): Unit = {
// Use the SparkSession API introduced in Spark 2.0 as the entry point of the application
val spark = SparkSession.builder().master("local[*]").appName("ReadTxtDataFrameExample").getOrCreate()
// Read a txt file; textFile returns a Dataset[String]
val inputTextFile = spark.read.textFile(READ_DATA_DIR + "people.txt")
// Print the results
printArray(inputTextFile.collect)
// Read all txt files under the directory
val allTextFile = spark.read.textFile(READ_DATA_DIR + "*.txt")
// Print the results
printArray(allTextFile.collect)
// Stop the SparkSession and end the job
spark.stop()
}
}
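textFile returns a Dataset[String]; the closely related spark.read.text returns a DataFrame with a single string column named value, which is handy when the lines should flow into further DataFrame operations. A minimal sketch (the object name is illustrative):
package com.shockang.study.spark.sql.read

import com.shockang.study.spark.{READ_DATA_DIR, printArray}
import org.apache.spark.sql.SparkSession

object ReadTextAsDataFrameExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("ReadTextAsDataFrameExample").getOrCreate()
    // text() returns a DataFrame with a single string column named "value"
    val df = spark.read.text(READ_DATA_DIR + "people.txt")
    df.printSchema()
    printArray(df.collect)
    spark.stop()
  }
}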
Writing
CSV
package com.shockang.study.spark.core.write
import com.shockang.study.spark.WRITE_DATA_DIR
import com.shockang.study.spark.util.Utils.writableLocalFsPath
import org.apache.spark.{SparkConf, SparkContext}
object WriteCsvRDDExample {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
.setMaster("local[*]")
.setAppName("WriteCsvRDDExample")
val sc = new SparkContext(conf)
val array = Array("Thomas", 18, "male", "65kg", "180cm")
// Join the fields with commas and save as CSV
val csvRDD = sc.parallelize(Array(array.mkString(",")), 1)
val csvPath = WRITE_DATA_DIR + "WriteCsvRDDExample1"
csvRDD.saveAsTextFile(writableLocalFsPath(csvPath))
// Join the fields with tabs and save as TSV
val tsvRDD = sc.parallelize(Array(array.mkString("\t")), 1)
val tsvPath = WRITE_DATA_DIR + "WriteCsvRDDExample2"
tsvRDD.saveAsTextFile(writableLocalFsPath(tsvPath))
sc.stop
}
}
package com.shockang.study.spark.sql.write
import com.shockang.study.spark.WRITE_DATA_DIR
import com.shockang.study.spark.util.Utils.writableLocalFsPath
import org.apache.spark.sql.SparkSession
/**
 * Write data to csv/tsv files
*
* @author Shockang
*/
object WriteCsvDataFrameExample {
def main(args: Array[String]): Unit = {
// Use the SparkSession API introduced in Spark 2.0 as the entry point of the application
val spark = SparkSession.builder().master("local[*]").appName("WriteCsvDataFrameExample").getOrCreate()
// Save in CSV format
val df = spark.createDataFrame(List(("one", 1), ("two", 2), ("three", 3)))
val csvPath = WRITE_DATA_DIR + "WriteCsvDataFrameExample1"
df.write.csv(writableLocalFsPath(csvPath))
// Save in TSV format, using tab as the delimiter
val tsvPath = WRITE_DATA_DIR + "WriteCsvDataFrameExample2"
df.write.option("delimiter", "\\t").csv(writableLocalFsPath(tsvPath))
spark.stop
}
}
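Running the example twice fails because the output directory already exists. Writing a header row and choosing an explicit save mode are two options worth knowing here; a minimal sketch (the column names and the overwrite choice are illustrative):
package com.shockang.study.spark.sql.write

import com.shockang.study.spark.WRITE_DATA_DIR
import com.shockang.study.spark.util.Utils.writableLocalFsPath
import org.apache.spark.sql.SparkSession

object WriteCsvWithHeaderExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("WriteCsvWithHeaderExample").getOrCreate()
    val df = spark.createDataFrame(List(("one", 1), ("two", 2), ("three", 3))).toDF("word", "number")
    val path = WRITE_DATA_DIR + "WriteCsvWithHeaderExample"
    df.write
      // write a header line with the column names
      .option("header", "true")
      // replace the output directory if it already exists
      .mode("overwrite")
      .csv(writableLocalFsPath(path))
    spark.stop()
  }
}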
Hadoop
package com.shockang.study.spark.core.write
import com.shockang.study.spark._
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object WriteHadoopRDDExample {
def main(args: Array[String]): Unit = {
// Initialize the SparkConf object and set the basic job parameters
val conf: SparkConf = new SparkConf()
// Set the master URL to submit to; "local" runs locally and [*] uses as many worker threads as there are cores
.setMaster("local[*]")
// Set the application name
.setAppName("WriteHadoopRDDExample")
// Instantiate the SparkContext
val sc: SparkContext = new SparkContext(conf)
val rdd: RDD[(String, String)] = sc.parallelize(Seq(("Michael", "29"), ("Andy", "30"), ("Justin", "19")), 1).cache()
// Write with the old Hadoop API (org.apache.hadoop.mapred)
val oldPath: String = HDFS_WRITE_DIR + "WriteHadoopRDDExampleWithOldAPI"
rdd.saveAsHadoopFile(oldPath, classOf[LongWritable], classOf[Text], classOf[org.apache.hadoop.mapred.TextOutputFormat[LongWritable, Text]])
// Write with the new Hadoop API (org.apache.hadoop.mapreduce)
val newPath: String = HDFS_WRITE_DIR + "WriteHadoopRDDExampleWithNewAPI"
rdd.saveAsNewAPIHadoopFile(newPath, classOf[LongWritable], classOf[Text], classOf[TextOutputFormat[LongWritable, Text]])
sc.stop
}
}
JSON
package com.shockang.study.spark.core.write
import com.shockang.study.spark.WRITE_DATA_DIR
import com.shockang.study.spark.util.Utils.writableLocalFsPath
import org.apache.spark.{SparkConf, SparkContext}
import scala.util.parsing.json.{JSONArray, JSONObject}
object WriteJsonRDDExample {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
.setMaster("local[*]")
.setAppName("WriteJsonRDDExample")
val sc = new SparkContext(conf)
val map1 = Map("name" -> "Thomas", "age" -> "20", "address" -> JSONArray(List("通信地址1", "通信地址2")))
val map2 = Map("name" -> "Alice", "age" -> "18", "address" -> JSONArray(List("通信地址1", "通信地址2", "通信地址3")))
// Each JSONObject is rendered as a JSON string by saveAsTextFile
val rddData = sc.parallelize(List(JSONObject(map1), JSONObject(map2)), 1)
val jsonPath = WRITE_DATA_DIR + "WriteJsonRDDExample"
rddData.saveAsTextFile(writableLocalFsPath(jsonPath))
sc.stop()
}
}
package com.shockang.study.spark.sql.write
import com.shockang.study.spark._
import com.shockang.study.spark.util.Utils.writableLocalFsPath
import org.apache.spark.sql.SparkSession
/**
 * Write data to a json file
*
* @author Shockang
*/
object WriteJsonDataFrameExample {
def main(args: Array[String]): Unit = {
// Use the SparkSession API introduced in Spark 2.0 as the entry point of the application
val spark = SparkSession.builder().master("local[*]").appName("WriteJsonDataFrameExample").getOrCreate()
// Save in JSON format (one JSON object per line)
val df = spark.createDataFrame(List(("one", 1), ("two", 2), ("three", 3)))
val jsonPath = WRITE_DATA_DIR + "WriteJsonDataFrameExample"
df.write.json(writableLocalFsPath(jsonPath))
spark.stop()
}
}
MySQL
package com.shockang.study.spark.core.write
import com.shockang.study.spark.{MYSQL_JDBC_URL, MYSQL_PASS, MYSQL_USER}
import org.apache.spark.{SparkConf, SparkContext}
import java.sql.DriverManager
object WriteMysqlRDDExample {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
.setMaster("local[*]")
.setAppName("WriteMysqlRDDExample")
val sc = new SparkContext(conf)
Class.forName("com.mysql.jdbc.Driver")
val rddData = sc.parallelize(List(("Alice", 30), ("Kotlin", 37)))
// Open one JDBC connection per partition instead of per record
rddData.foreachPartition((iter: Iterator[(String, Int)]) => {
val conn = DriverManager.getConnection(MYSQL_JDBC_URL, MYSQL_USER, MYSQL_PASS)
conn.setAutoCommit(false)
val preparedStatement = conn.prepareStatement("INSERT INTO spark_examples.person (`name`, `age`) VALUES (?, ?);")
iter.foreach(t => {
preparedStatement.setString(1, t._1)
preparedStatement.setInt(2, t._2)
preparedStatement.addBatch()
})
preparedStatement.executeBatch()
conn.commit()
conn.close()
})
sc.stop()
}
}
package com.shockang.study.spark.sql.write
import com.shockang.study.spark.{MYSQL_DB_TABLE, MYSQL_JDBC_URL, MYSQL_PASS, MYSQL_USER}
import org.apache.spark.sql.SparkSession
import java.util.Properties
/**
*
* @author Shockang
*/
object WriteMysqlDataFrameExample {
def main(args: Array[String]): Unit = {
// Use the SparkSession API introduced in Spark 2.0 as the entry point of the application
val spark = SparkSession.builder().master("local[*]").appName("WriteMysqlDataFrameExample").getOrCreate()
val df = spark.createDataFrame(List(
("Alice", "Female", "20"),
("Tom", "Male", "25"),
("Boris", "Male", "18"))).toDF("name", "sex", "age")
// Saving data to a JDBC source
df.write
.format("jdbc")
.option("url", MYSQL_JDBC_URL)
.option("dbtable", MYSQL_DB_TABLE)
.option("user", MYSQL_USER)
.option("password", MYSQL_PASS)
.save()
val connectionProperties = new Properties()
connectionProperties.put("user", MYSQL_USER)
connectionProperties.put("password", MYSQL_PASS)
df.write
.jdbc(MYSQL_JDBC_URL, MYSQL_DB_TABLE, connectionProperties)
// createTableColumnTypes customizes the column DDL used when Spark creates the target table;
// note that the columns referenced here must exist in the DataFrame schema
df.write
.option("createTableColumnTypes", "name CHAR(64), sex VARCHAR(16)")
.jdbc(MYSQL_JDBC_URL, MYSQL_DB_TABLE, connectionProperties)
spark.stop()
}
}
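As written, the second and third writes target the same table as the first and will fail once it exists. In practice a save mode is chosen explicitly; a minimal sketch of appending rows to an existing table (the append choice is illustrative):
package com.shockang.study.spark.sql.write

import com.shockang.study.spark.{MYSQL_DB_TABLE, MYSQL_JDBC_URL, MYSQL_PASS, MYSQL_USER}
import org.apache.spark.sql.{SaveMode, SparkSession}

object WriteMysqlAppendExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("WriteMysqlAppendExample").getOrCreate()
    val df = spark.createDataFrame(List(("Alice", "Female", "20"), ("Tom", "Male", "25"))).toDF("name", "sex", "age")
    df.write
      .format("jdbc")
      .option("url", MYSQL_JDBC_URL)
      .option("dbtable", MYSQL_DB_TABLE)
      .option("user", MYSQL_USER)
      .option("password", MYSQL_PASS)
      // append inserts into the existing table instead of failing or recreating it
      .mode(SaveMode.Append)
      .save()
    spark.stop()
  }
}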
Object Files
package com.shockang.study.spark.core.write
import com.shockang.study.spark._
import com.shockang.study.spark.core.read.ReadObjectRDDExample.Person
import com.shockang.study.spark.util.Utils.writableLocalFsPath
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
*
 * Write data to an object file
*
* @author Shockang
*/
object WriteObjectRDDExample {
def main(args: Array[String]): Unit = {
// Initialize the SparkConf object and set the basic job parameters
val conf: SparkConf = new SparkConf()
// Set the master URL to submit to; "local" runs locally and [*] uses as many worker threads as there are cores
.setMaster("local[*]")
// Set the application name
.setAppName("WriteObjectRDDExample")
// Instantiate the SparkContext
val sc: SparkContext = new SparkContext(conf)
val rdd: RDD[Person] = sc.parallelize(Seq(Person("小明", 20), Person("Alice", 18)), 1)
val filePath: String = WRITE_DATA_DIR + "WriteObjectRDDExample"
rdd.saveAsObjectFile(writableLocalFsPath(filePath))
sc.stop()
}
}
SequenceFile
package com.shockang.study.spark.core.write
import com.shockang.study.spark._
import com.shockang.study.spark.util.Utils.writableLocalFsPath
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
*
 * Write data to a SequenceFile
*
* @author Shockang
*/
object WriteSequenceRDDExample {
def main(args: Array[String]): Unit = {
// Initialize the SparkConf object and set the basic job parameters
val conf: SparkConf = new SparkConf()
// Set the master URL to submit to; "local" runs locally and [*] uses as many worker threads as there are cores
.setMaster("local[*]")
// Set the application name
.setAppName("WriteSequenceRDDExample")
// Instantiate the SparkContext
val sc: SparkContext = new SparkContext(conf)
val rdd: RDD[(String, String)] = sc.parallelize(Seq(("姓名", "小明"), ("年龄", "18")), 1)
val path: String = WRITE_DATA_DIR + "WriteSequenceExample"
rdd.saveAsSequenceFile(writableLocalFsPath(path))
sc.stop()
}
}
TXT
package com.shockang.study.spark.core.write
import com.shockang.study.spark._
import com.shockang.study.spark.util.Utils.writableLocalFsPath
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Write data to a txt file
*
* @author Shockang
*/
object WriteTxtRDDExample {
def main(args: Array[String]): Unit = {
// Initialize the SparkConf object and set the basic job parameters
val conf: SparkConf = new SparkConf()
// Set the master URL to submit to; "local" runs locally and [*] uses as many worker threads as there are cores
.setMaster("local[*]")
// Set the application name
.setAppName("WriteTxtRDDExample")
// Instantiate the SparkContext
val sc: SparkContext = new SparkContext(conf)
val rdd: RDD[(String, Int)] = sc.parallelize(Array(("one", 1), ("two", 2), ("three", 3)), 1)
val filePath: String = WRITE_DATA_DIR + "WriteTxtRDDExample"
rdd.saveAsTextFile(writableLocalFsPath(filePath))
sc.stop
}
}
package com.shockang.study.spark.sql.write
import com.shockang.study.spark.WRITE_DATA_DIR
import com.shockang.study.spark.util.Utils.writableLocalFsPath
import org.apache.spark.sql.SparkSession
/**
 * Write data to a txt file
*
* @author Shockang
*/
object WriteTxtDataFrameExample {
case class Person(name: String, sex: String, age: Int)
def main(args: Array[String]): Unit = {
// Use the SparkSession API introduced in Spark 2.0 as the entry point of the application
val spark = SparkSession.builder().master("local[*]").appName("WriteTxtDataFrameExample").getOrCreate()
// Note the implicit imports needed by the map below
import spark.implicits._
// text() only supports a single string column, so each row is converted to a String first
val df = spark.createDataFrame(List(("one", 1), ("two", 2), ("three", 3))).map(_.mkString("(", ", ", ")"))
val filePath = WRITE_DATA_DIR + "WriteTxtDataFrameExample"
df.write.text(writableLocalFsPath(filePath))
spark.stop
}
}
Parquet
package com.shockang.study.spark.sql.read
import com.shockang.study.spark._
import org.apache.spark.sql.SparkSession
/**
*
* @author Shockang
*/
object ParquetExample {
def main(args: Array[String]): Unit = {
// Use the SparkSession API introduced in Spark 2.0 as the entry point of the application
val spark = SparkSession.builder().master("local[*]").appName("ParquetExample").getOrCreate()
import spark.implicits._
val peopleDF = spark.read.json(READ_DATA_DIR + "people.json")
// DataFrames can be saved as Parquet files, preserving the schema information
peopleDF.write.parquet(WRITE_DATA_DIR + "people.parquet")
// Read the Parquet file written above; the result is also a DataFrame
val parquetFileDF = spark.read.parquet(WRITE_DATA_DIR + "people.parquet")
// Parquet data can be registered as a temporary view and queried with SQL
parquetFileDF.createOrReplaceTempView("parquetFile")
val namesDF = spark.sql("SELECT name FROM parquetFile WHERE age BETWEEN 10 AND 20")
namesDF.map(attributes => "Name: " + attributes(0)).show()
spark.stop()
}
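Two Parquet features that often come up next are partitioned output and schema merging on read. A minimal sketch (the output directory name is illustrative; mergeSchema only matters when the partition files were written with different schemas):
package com.shockang.study.spark.sql.read

import com.shockang.study.spark._
import org.apache.spark.sql.SparkSession

object ParquetPartitionExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("ParquetPartitionExample").getOrCreate()
    val peopleDF = spark.read.json(READ_DATA_DIR + "people.json")
    // Write one sub-directory per distinct age value, e.g. .../age=19/
    peopleDF.write.partitionBy("age").parquet(WRITE_DATA_DIR + "people_partitioned.parquet")
    // mergeSchema reconciles the schemas of all partition files while reading
    val mergedDF = spark.read.option("mergeSchema", "true").parquet(WRITE_DATA_DIR + "people_partitioned.parquet")
    mergedDF.printSchema()
    mergedDF.show()
    spark.stop()
  }
}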
}