Spark Operations on MySQL
Create the project
Choose a Spark Maven project
Dependencies (for reference)
First change the Scala version to 2.12.10
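For reference, a minimal sketch of the matching <properties> block in pom.xml, so that the ${scala.version} placeholder used below resolves; the property name follows the common Scala archetype convention and may differ in your project:

<properties>
    <scala.version>2.12.10</scala.version>
</properties>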
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.12</artifactId>
    <version>3.1.2</version>
</dependency>
<dependency>
    <groupId>org.scala-lang</groupId>
    <artifactId>scala-library</artifactId>
    <version>${scala.version}</version>
</dependency>
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.48</version>
</dependency>
<dependency>
    <groupId>org.specs</groupId>
    <artifactId>specs</artifactId>
    <version>1.2.5</version>
    <scope>test</scope>
</dependency>
<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-jdbc</artifactId>
    <version>3.1.2</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.12</artifactId>
    <version>3.1.2</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-hive_2.12</artifactId>
    <version>3.1.2</version>
</dependency>
Create the resources directory (src/main/resources) and add log4j.properties
log4j.rootLogger=ERROR,C,R
log4j.appender.C=org.apache.log4j.ConsoleAppender
log4j.appender.C.layout=org.apache.log4j.PatternLayout
log4j.appender.C.layout.ConversionPattern=%-d{yyyy-MM-dd HH:mm:ss} [%c]-[%p] %m%n
log4j.appender.R=org.apache.log4j.RollingFileAppender
log4j.appender.R.Encoding=UTF-8
log4j.appender.R.File=user_act_log/prolog.log
log4j.appender.R.MaxFileSize=128MB
log4j.appender.R.MaxBackupIndex=40
log4j.appender.R.layout=org.apache.log4j.PatternLayout
log4j.appender.R.layout.ConversionPattern=%-d{yyyy-MM-dd HH:mm:ss} [%c]-[%p] %m%n
log4j.logger.org.apache.spark.repl.Main=WARN
# Settings to quiet third party logs that are too verbose
log4j.logger.org.spark_project.jetty=WARN
log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=ERROR
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=ERROR
log4j.logger.org.apache.parquet=ERROR
log4j.logger.parquet=ERROR
# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
The code is as follows:
import java.util.Properties

import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._

/**
 * There are two ways to operate on the database:
 * 1. SQL statements: spark.sql("select ...").show | save
 * 2. Dataset/DataFrame SQL operators -- these require two imports:
 *    the SQL functions (aggregate functions, window functions, the window clause):
 *      import org.apache.spark.sql.functions._
 *    the SparkSession implicit conversions (a SparkSession instance must exist first):
 *      import spark.implicits._
 */
object SparkReadMysql { // object name is illustrative
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .master("local[*]").appName("spark_sql_01")
      .getOrCreate()
    val URL = "jdbc:mysql://192.168.6.130:3306/test?useSSL=false&characterEncoding=utf8"
    val TABLE = "Employer"
    val PRO = new Properties()
    // Set the user name, password, and JDBC driver for the connection
    PRO.setProperty("driver", "com.mysql.jdbc.Driver")
    PRO.setProperty("user", "root")
    PRO.setProperty("password", "12345678")
    // Read the MySQL table via spark.read.jdbc
    val frame: DataFrame = spark.read.jdbc(URL, TABLE, PRO)
    spark.read.jdbc(URL, TABLE, PRO).createTempView("Employer")
    // Query
    //frame.show()
    // Append mode "updates" the original table by copying the data and appending it after the existing rows
    //frame.write.mode("append").jdbc(URL, TABLE, PRO)
    // Operate via SQL statements
    /*spark.sql(
      """
        |select * from Employer
        |
        |""".stripMargin)
      // .printSchema()
      // .show() // show displays at most 20 rows by default
      .select("*")
      .toJavaRDD
      .saveAsTextFile("")*/
    // The SparkSession implicit-conversion import
    import spark.implicits._
    // Column name: "name"; reference to the column's value: $"name"
    /*frame.select($"name", concat_ws(",", $"name", $"age").as("name_age"))
      // .where($"age".between(0, 22))
      // .where($"age".equalTo(23)) // eq|gt|lt|le|ge|ne
      // .where($"age" === 23)
      // .where($"name".equalTo("dai"))
      // .where($"age".like("%_3")) // pattern matching
      // .where($"age".isNull)
      // .where($"age".isNotNull)
      // .where($"age".isNaN) // whether the value is NaN (not a number)
      // .where($"age".isin(3, 22))
      // .where($"age".geq(22))
      // Logical operators
      // .where(not($"age".gt(4) and $"name" === "dai"))
      .where(not($"age".gt(4) or $"name" === "dai"))
      .show()*/
    spark.close()
  }
}
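The commented-out append line above writes the frame straight back onto the table it was read from. For completeness, here is a minimal self-contained sketch of the more common pattern: writing a query result into a separate table. The object name SparkWriteMysql, the target table Employer_backup, and the age filter are illustrative; the connection details are assumed to be the same as above.

import java.util.Properties

import org.apache.spark.sql.{DataFrame, SparkSession}

object SparkWriteMysql { // illustrative name
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]").appName("spark_sql_write")
      .getOrCreate()
    import spark.implicits._

    val url = "jdbc:mysql://192.168.6.130:3306/test?useSSL=false&characterEncoding=utf8"
    val pro = new Properties()
    pro.setProperty("driver", "com.mysql.jdbc.Driver")
    pro.setProperty("user", "root")
    pro.setProperty("password", "12345678")

    // Read Employer, project and filter it, then write the result to a new table.
    // mode("overwrite") drops and recreates the target table; mode("append") adds rows.
    val frame: DataFrame = spark.read.jdbc(url, "Employer", pro)
    frame.select($"name", $"age")
      .where($"age".geq(18))
      .write.mode("overwrite")
      .jdbc(url, "Employer_backup", pro)

    spark.close()
  }
}

Pointing the write at a table other than the source avoids the self-append effect shown in the commented-out example.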