MySQL
- Read
//Read a MySQL table
spark.read.format("jdbc")
.option("user", "root")
.option("password", "passwd")
.option("driver", "com.mysql.jdbc.Driver")
.option("url", "jdbc:mysql://localhost/shtd_store?characterEncoding=UTF-8&useSSL=false")
.option("dbtable", "base_region")
.load()
- Write
//The target table is created automatically if it does not exist
import java.util.Properties
import org.apache.spark.sql.SaveMode

val prop = new Properties()
prop.setProperty("user", "root")
prop.setProperty("password", "passwd")
prop.setProperty("driver", "com.mysql.jdbc.Driver")
//Save the DataFrame to the machine_running_compare table in the MySQL shtd_industry database
val url = "jdbc:mysql://localhost:3306/shtd_industry?characterEncoding=UTF-8&useSSL=false"
df.write.mode(SaveMode.Overwrite).jdbc(url, "machine_running_compare", prop)
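Note that SaveMode.Overwrite drops and recreates the table by default, discarding the existing column types and indexes; the truncate option keeps the table definition and only clears the rows. A minimal sketch:
//Overwrite by truncating rather than dropping, so the MySQL table definition survives
df.write.mode(SaveMode.Overwrite)
.option("truncate", "true")
.jdbc(url, "machine_running_compare", prop)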
Common issue 1 (database connection timeout). Cause: the connection lifetime on the application side is longer than the timeout configured on the MySQL server, so a pooled connection may already be closed by the server when it is reused.
The last packet successfully received from the server was 396 milliseconds ago. The last packet sent successfully to the server was 388 milliseconds ago.
Fix: append &useSSL=false to the URL:
jdbc:mysql://localhost:3306/test?characterEncoding=UTF-8&useSSL=false
Common issue 2 (Public Key Retrieval is not allowed exception)
Fix: append &allowPublicKeyRetrieval=true to the URL:
jdbc:mysql://localhost:3306/test?characterEncoding=UTF-8&useSSL=false&allowPublicKeyRetrieval=true
Or upgrade the mysql-connector-java driver, for example (the version below is only an example):
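<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>8.0.28</version>
</dependency>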
MySQL 8.0 URL configuration
If the connection settings are correct and the driver jar is on the classpath but you still get a
"Could not create connection to database server" error, the cause is that MySQL 8.0 uses a different driver class and connection string, and it also requires an explicit server timezone. Use the following settings:
url: jdbc:mysql://localhost:3306/my_db1?characterEncoding=utf8&useSSL=false&serverTimezone=UTC&rewriteBatchedStatements=true
driver: com.mysql.cj.jdbc.Driver
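A minimal sketch of a Spark read with the 8.x driver, reusing the URL above (database and table names are just examples):
spark.read.format("jdbc")
.option("user", "root")
.option("password", "passwd")
.option("driver", "com.mysql.cj.jdbc.Driver")
.option("url", "jdbc:mysql://localhost:3306/my_db1?characterEncoding=utf8&useSSL=false&serverTimezone=UTC")
.option("dbtable", "base_region")
.load()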
Hive
//Read a Hive table by database.table name
spark.table("dwd.dim_customer")
//Write: the table is created automatically if it does not exist
//Save the DataFrame to the machine_running_compare table in the Hive dws database
df.write.partitionBy("year") //partitionBy sets the partition column(s) of a partitioned table
.mode(SaveMode.Overwrite)
.saveAsTable("dws.machine_running_compare")
CSV
//Read
spark.read.format("csv")
.option("sep", ",") //field separator
.option("header", "true") //the first row is a header, so it is not read as data
.option("inferSchema", "true") //infer column types from the data
.load("data/data_hotel.csv") //file path
//Write
//Method 1: coalesce(1) produces a single part file inside the output directory
df.coalesce(1).write.option("header", "true").option("sep", ",").csv("sample_file.csv")
//Method 2: the spark-csv package (built into Spark as the "csv" format since 2.0)
df.write.format("com.databricks.spark.csv").option("header", "true").save("myFile.csv")
Excel
<!--pom dependency needed for reading and writing Excel files-->
<dependency>
<groupId>com.crealytics</groupId>
<artifactId>spark-excel_2.12</artifactId>
<version>0.14.0</version>
</dependency>
import java.util.Properties
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}
object test01 {
def main(args: Array[String]): Unit = {
val inpath = "D:\\Project102\\大数据技术学生追踪分析\\大数据技术201302\\大数据201302第二学年获奖情况.xlsx"
val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("test")
val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
val sc: SparkContext = spark.sparkContext
import spark.implicits._
//Read the Excel file with Spark SQL
val schema: StructType = StructType(List(
StructField("stu_id", StringType, nullable = false),
StructField("name", StringType, nullable = false),
StructField("award", StringType, nullable = false)
))
val df: DataFrame = spark.read.format("com.crealytics.spark.excel")
.option("dataAddress", "'Sheet1'!A2:C35")
.option("Header", "false")
.schema(schema)
.load(inpath)
/*
dataAddress: the cell range to read from the sheet
header: whether the first row of the range is a header row
*/
df.createOrReplaceTempView("t1")
// df.show()
spark.sql(
"""
|select stu_id,name,award1 from t1
|lateral view explode(split(award,"\\s+")) tmpTable as award1
|""".stripMargin).show(100, truncate = false)
val result_df: DataFrame = spark.sql(
"""
|select stu_id,name,award1 from t1
|lateral view explode(split(award,"\\s+")) tmpTable as award1
|""".stripMargin)
val prop = new Properties()
prop.setProperty("user", "root")
prop.setProperty("password", "passwd")
prop.setProperty("driver", "com.mysql.jdbc.Driver")
result_df.write.mode(SaveMode.Overwrite).jdbc("jdbc:mysql://127.0.0.1:3306/test01?characterEncoding=UTF-8&useSSL=false", "student_award", prop)
sc.stop()
spark.close()
}
}
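The same dependency can also write Excel; a minimal sketch (the output path and sheet address are illustrative):
df.write
.format("com.crealytics.spark.excel")
.option("dataAddress", "'Sheet1'!A1")
.option("header", "true")
.mode("overwrite")
.save("student_award.xlsx")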
ClickHouse
Official driver
<dependency>
<groupId>ru.yandex.clickhouse</groupId>
<artifactId>clickhouse-jdbc</artifactId>
<version>0.3.2</version>
</dependency>
val driver = "ru.yandex.clickhouse.ClickHouseDriver"
val url_clickhouse = "jdbc:clickhouse://master:8123"
val properties = new Properties()
properties.setProperty("user", "default")
properties.setProperty("password", "passwd")
properties.setProperty("driver", driver)
//The target table must exist before writing; for example:
//create table my_db1.base_region(id UInt32, region_name String) ENGINE=MergeTree() order by id;
//(a sketch for running this DDL from code follows the block below)
spark.read.table("tmp.base_region").write.mode(SaveMode.Append)
// .option("batchsize", "20000")
// .option("isolationLevel", "NONE")
// .option("numPartitions", "1")
.jdbc(url_clickhouse, "my_db1.base_region", properties)
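If you prefer creating the table from code rather than in the ClickHouse client, you can run the DDL over plain JDBC before the Spark write; a minimal sketch with the official driver:
import java.sql.DriverManager

//run the DDL once before the Spark write; engine and order-by must match your data
val conn = DriverManager.getConnection(url_clickhouse, properties)
try {
conn.createStatement().execute(
"create table if not exists my_db1.base_region(id UInt32, region_name String) ENGINE=MergeTree() order by id")
} finally {
conn.close()
}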
Third-party driver
<!--third-party ClickHouse driver; uses the native protocol on default port 9000-->
<dependency>
<groupId>com.github.housepower</groupId>
<artifactId>clickhouse-native-jdbc</artifactId>
<version>2.6.5</version>
</dependency>
//Read a ClickHouse table, pushing the time window down as a subquery
val tablename = s"(select * from table_op where LocationTime between '$start_time' and '$end_time') temp"
val readDataDf = spark.read
.format("jdbc")
.option("url", "jdbc:clickhouse://master:9000/my_db1")
.option("fetchsize", "500000")
.option("driver", "com.github.housepower.jdbc.ClickHouseDriver")
.option("user", "default")
.option("password", "passwd")
.option("dbtable", tablename)
.load()
dataFrame.write.format("jdbc")
.option("driver", "com.github.housepower.jdbc.ClickHouseDriver")
// rewriteBatchedStatements enables batched inserts; use as appropriate
.option("url", "jdbc:clickhouse://10.32.30.248:9000/my_db1?rewriteBatchedStatements=true")
.option("batchsize", 10000)
.option("user","default")
.option("password","passwd")
.option("dbtable","base_province")
//required when the save mode is Overwrite; the Spark SQL docs explain the JDBC options in detail
.option("truncate","true")
.option("createTableOptions","ENGINE=Log()")
.option("isolationLevel","NONE")
.mode(SaveMode.Overwrite)
.save()