Spark on Hive
-
First, copy the hive-site.xml file from Hive's conf directory into Spark's conf directory:
cp /opt/hive/conf/hive-site.xml /opt/spark245/conf
-
Check that hive.metastore.uris is correct (TODO: configuration not finished; connecting from IDEA fails). Note that the host here (192.168.237.100) differs from the one used in the Spark code below (192.168.126.100), which may be related to the failure.
<property>
  <name>hive.metastore.uris</name>
  <value>thrift://192.168.237.100:9083</value>
  <description>points to the host running the metastore service</description>
</property>
-
Start Hive's metastore service in the background:
nohup hive --service metastore &
This runs the Hive metastore in the background; press Enter in the blocked terminal window to return to the shell prompt.
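Since the TODO above notes that connecting from IDEA fails, one quick sanity check is whether the metastore port is reachable at all from the development machine. Below is a minimal Scala sketch, not part of the original setup; the host and port are assumed from the hive-site.xml above:

import java.net.{InetSocketAddress, Socket}

// Hypothetical connectivity check: succeeds only if the metastore
// port accepts TCP connections from this machine.
object MetastorePortCheck {
  def main(args: Array[String]): Unit = {
    val socket = new Socket()
    try {
      // host/port assumed from hive.metastore.uris; 3-second timeout
      socket.connect(new InetSocketAddress("192.168.237.100", 9083), 3000)
      println("metastore port 9083 is reachable")
    } catch {
      case e: Exception => println(s"cannot reach metastore: ${e.getMessage}")
    } finally {
      socket.close()
    }
  }
}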
-
Add the dependencies:
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-hive_2.11</artifactId>
  <version>2.1.1</version>
</dependency>
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-sql_2.11</artifactId>
  <version>2.1.1</version>
</dependency>
-
Spark code to run:
import org.apache.spark.sql.{DataFrame, SparkSession}

object SparkSqlOnHiveDemo {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .master("local")
      .appName("sparksqlonhivedemo")
      .config("hive.metastore.uris", "thrift://192.168.126.100:9083")
      .enableHiveSupport()
      .getOrCreate()
    import spark.implicits._

    // Spark connects to Hive's default database by default;
    // to query another database, use db_name.table_name
    // val frame: DataFrame = spark.sql("show databases")
    // frame.show()

    val df: DataFrame = spark.sql("select * from toronto")
    // three equivalent ways to filter rows whose ssn starts with "111"
    df.where("ssn like '111%'").show()
    df.where(df("ssn").startsWith("111")).show()
    df.filter(df("ssn").startsWith("111")).show()
    // df.printSchema()
    // df.show()
  }
}
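The same session can also write results back to Hive via saveAsTable. A minimal sketch, continuing from the df defined above and assuming a hypothetical target table toronto_filtered in the default database:

// hypothetical table name; mode("overwrite") replaces the table
// contents if it already exists
df.where("ssn like '111%'")
  .write
  .mode("overwrite")
  .saveAsTable("toronto_filtered")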
Spark on MySQL
-
Add the dependency:
<dependency>
  <groupId>mysql</groupId>
  <artifactId>mysql-connector-java</artifactId>
  <version>5.1.38</version>
</dependency>
-
Connect from IDEA:
import java.util.Properties

import org.apache.spark.sql.{DataFrame, SparkSession}

object SparkSqlOnMysqlDemo {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .master("local")
      .appName("sparksqlmysqldemo")
      .getOrCreate()
    import spark.implicits._

    val url = "jdbc:mysql://192.168.126.100:3306/hive"
    val prop = new Properties()
    prop.setProperty("user", "root")
    prop.setProperty("password", "ok")
    prop.setProperty("driver", "com.mysql.jdbc.Driver")

    // read Hive's metastore table TBLS over JDBC into a DataFrame
    val df: DataFrame = spark.read.jdbc(url, "TBLS", prop)
    // df.printSchema()
    // df.show()
    // println(df.where(df("CREATE_TIME").startsWith("160")).count())
    df.groupBy(df("DB_ID")).count().show()
  }
}
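Writing a result back to MySQL goes through the same JDBC url and properties. A minimal sketch, continuing from the df above and assuming a hypothetical target table tbls_count_by_db (Spark creates it if it does not exist; writing into the hive metastore database here is purely for illustration):

// hypothetical table; mode("overwrite") drops and recreates it on each run
df.groupBy(df("DB_ID")).count()
  .write
  .mode("overwrite")
  .jdbc(url, "tbls_count_by_db", prop)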