1. Reading CSV files with Spark (four ways)
//Way 1: call the csv method directly
//(the $ syntax and toDF used below require: import spark.implicits._)
//the file has no header row, so Spark names the columns _c0, _c1, _c2
val sales4: DataFrame = spark.read.option("header", "false").csv("file:///D:\\Software\\idea_space\\spark_streaming\\src\\data\\exam\\sales.csv")
.withColumnRenamed("_c0", "time")
.withColumnRenamed("_c1", "id")
.withColumnRenamed("_c2", "salary")
.withColumn("salary", $"salary".cast("Double"))
//Way 2: use format
//(on Spark 2.x the built-in short name also works: format("csv"))
val salesDF3 = spark.read.format("com.databricks.spark.csv")
//header defaults to false: the first row is read as data; "true" uses the first row as the column names
// .option("header", "true")
//infer column types automatically
// .option("inferSchema", true)
.load("D:\\Software\\idea_space\\spark_streaming\\src\\data\\exam\\sales.csv")
//rename the columns
.withColumnRenamed("_c0", "time")
.withColumnRenamed("_c1", "id")
.withColumnRenamed("_c2", "salary")
//cast the column type after renaming
.withColumn("salary", $"salary".cast("Double"))
//Way 3: split each line and map it to a case class, then call toDF
private val salesDF: DataFrame = sc.textFile("file:///D:\\Software\\idea_space\\spark_streaming\\src\\data\\exam\\sales.csv")
.map(_.split(",")).map(x => Sales(x(0), x(1), x(2).toDouble)).toDF()
//the case class must be defined at top level (outside the method) so toDF can derive its encoder
case class Sales(time: String, id: String, salary: Double)
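Once built, the DataFrame can be queried like any other; a quick usage sketch (the aggregation itself is illustrative, not from the original):
import org.apache.spark.sql.functions._
//total salary per id, just to confirm the DataFrame behaves as expected
salesDF.groupBy("id").agg(sum("salary").as("totalSalary")).show()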
//Way 4: define a schema and use createDataFrame
//(requires: import org.apache.spark.sql.Row and import org.apache.spark.sql.types._)
val userRDD: RDD[Row] = sc.textFile("file:///D:\\Software\\idea_space\\spark_streaming\\src\\data\\exam\\demo02.txt").map(_.split(","))
.map(x => Row(x(0), x(1), x(2).toInt))
val schema = StructType(Array(
  StructField("username", StringType, true),
  StructField("month", StringType, true),
  StructField("visitNum", IntegerType, true)
))
val userDF: DataFrame = spark.createDataFrame(userRDD,schema)
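To confirm the schema took effect:
userDF.printSchema() //username: string, month: string, visitNum: integer
userDF.show()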
2. Reading JSON files with Spark
//Way 1:
private val userJsonDF: DataFrame = spark.read.json("file:///D:\\Software\\idea_space\\spark_streaming\\src\\data\\exam\\users.json")
//Way 2:
private val userJsonDF: DataFrame = spark.read.format("json").load("D:\\Software\\idea_space\\spark_streaming\\src\\data\\exam\\users.json")
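Note that spark.read.json expects JSON Lines by default, i.e. one complete JSON object per line. For a single pretty-printed JSON document, the multiLine option (available since Spark 2.2) can be set; a sketch reusing the same path:
val userJsonMultiLine: DataFrame = spark.read.option("multiLine", true) //parse one JSON document spanning multiple lines
.json("D:\\Software\\idea_space\\spark_streaming\\src\\data\\exam\\users.json")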
3. Reading from and writing to MySQL with Spark
1) Reading MySQL data
//(requires: import java.util.Properties)
val url = "jdbc:mysql://hadoop1:3306/test"
val tableName = "sales"
private val prop = new Properties()
prop.setProperty("user","root")
prop.setProperty("password","ok")
prop.setProperty("driver","com.mysql.jdbc.Driver")
private val salesDF2: DataFrame = spark.read.jdbc(url,tableName,prop)
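For large tables the read can be parallelized with the partitioned jdbc overload, which splits the table on a numeric column. A sketch, assuming a hypothetical numeric id column with values roughly between 0 and 10000:
//sketch: parallel read, split into 4 partitions on the (assumed) numeric id column
val salesParallel: DataFrame = spark.read.jdbc(url, tableName, "id", 0L, 10000L, 4, prop)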
2) Writing a DataFrame to MySQL
val url = "jdbc:mysql://hadoop1:3306/test"
val tableName = "sales"
private val prop = new Properties()
prop.setProperty("user","root")
prop.setProperty("password","ok")
prop.setProperty("driver","com.mysql.jdbc.Driver")
//save modes: overwrite replaces existing data, append adds to it (errorifexists and ignore also exist)
salesDF.write.mode("overwrite").jdbc(url,tableName,prop)
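The same modes are available as the type-safe SaveMode enum; for example, appending instead of overwriting:
import org.apache.spark.sql.SaveMode
//append rows without dropping the existing table
salesDF.write.mode(SaveMode.Append).jdbc(url, tableName, prop)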
4. Working with Hive tables from Spark
Integrating Spark SQL with Hive:
1) Copy hive-site.xml to ${SPARK_HOME}/conf
2) Check that hive.metastore.uris in hive-site.xml is correct:
<property>
  <name>hive.metastore.uris</name>
  <value>thrift://hadoop1:9083</value>
</property>
3) Start the metastore service in the background: nohup hive --service metastore &
4) Start spark-shell
//once integrated with Hive, spark-shell can access Hive tables directly
val df=spark.table("toronto")
df.printSchema
df.show
To run the same thing from IDEA, copy hive-site.xml into the project's resources directory and add the spark-hive dependency:
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-hive_2.11</artifactId>
  <version>${spark.version}</version>
  <!--<scope>provided</scope>-->
</dependency>
//with hive-site.xml on the classpath (resources), build a Hive-enabled session
private val spark: SparkSession = SparkSession.builder().master("local[*]")
//enable Hive support
.enableHiveSupport()
.getOrCreate()
val df = spark.sql("select * from toronto")
df.show()
df.printSchema()
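With Hive support enabled, a DataFrame can also be written back as a Hive table. A minimal sketch (the target table name toronto_copy is made up for illustration):
//sketch: persist the query result as a new Hive-managed table
df.write.mode("overwrite").saveAsTable("toronto_copy")
spark.sql("show tables").show() //the new table should appear here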