前提说明
1.hive是安装在单台的namenode虚拟机上
2.mysql也是安装在单台的namenode虚拟机上
3.由于hive的表结构信息是存储在mysql中的,所以需要用到jdbc的jar文件,copy到spark的lib目录下(namenode和datanode都需要copy过去)
4.为了保障你的spark可以正常访问hive的数据,需要copy你的hive的配置文件hive-site.xml到spark的conf目录下
5.确保你的spark的配置文件spark-env.sh里面配置好
export SPARK_CLASSPATH=$SPARK_CLASSPATH:/usr/local/spark-1.5.1-bin-hadoop2.6/lib/mysql-connector-java-5.1.37-bin.jar
具体根据自己的环境进行配置
然后是spark查询hive将结果入mysql的操作的具体的代码,首先是build.sbt里面的内容
// Build definition for the demo job: query Hive via Spark SQL, write rows to MySQL.
name := "SparkSqlToMysql"
version := "1.0"
// Scala 2.10.x matches the prebuilt Spark 1.5.1 binaries deployed on the cluster.
scalaVersion := "2.10.4"
// Spark artifacts are "provided": the cluster supplies them at runtime,
// so they are kept out of the assembly (fat) jar.
libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-core" % "1.5.1" % "provided",
"org.apache.spark" %% "spark-sql" % "1.5.1" % "provided",
"org.apache.spark" %% "spark-hive" % "1.5.1" % "provided",
"org.apache.spark" %% "spark-streaming" % "1.5.1" % "provided",
// NOTE(review): not marked "provided", so it is bundled into the assembly jar — confirm intended.
"org.apache.spark" %% "spark-streaming-kafka" % "1.5.1",
// JDBC driver loaded by the job via Class.forName("com.mysql.jdbc.Driver").
"mysql" % "mysql-connector-java" % "5.1.37",
// NOTE(review): json4s is not referenced by the code shown in this file — possibly leftover.
"org.json4s" %% "json4s-native" % "3.2.10",
"org.json4s" %% "json4s-jackson" % "3.2.10"
)
// sbt-assembly merge rules for files that appear in multiple dependency jars.
mergeStrategy in assembly := {
// Drop jar manifests and META-INF signature files (signatures become invalid after merging).
case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard
case m if m.toLowerCase.matches("meta-inf.*\\.sf$") => MergeStrategy.discard
case "log4j.properties" => MergeStrategy.discard
// ServiceLoader registration files: keep one copy of each distinct line.
case m if m.toLowerCase.startsWith("meta-inf/services/") => MergeStrategy.filterDistinctLines
// Typesafe Config reference files must be concatenated, not overwritten.
case "reference.conf" => MergeStrategy.concat
// For everything else, arbitrarily keep the first copy encountered.
case _ => MergeStrategy.first
}
// NOTE(review): this is a verbatim duplicate of the build.sbt shown earlier in this
// document (likely a copy/paste or rendering artifact); only one copy belongs in the project.
name := "SparkSqlToMysql"
version := "1.0"
// Must match the Scala version the Spark 1.5.1 binaries were built against.
scalaVersion := "2.10.4"
// Spark itself is "provided" by the cluster; only non-provided entries end up in the fat jar.
libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-core" % "1.5.1" % "provided",
"org.apache.spark" %% "spark-sql" % "1.5.1" % "provided",
"org.apache.spark" %% "spark-hive" % "1.5.1" % "provided",
"org.apache.spark" %% "spark-streaming" % "1.5.1" % "provided",
"org.apache.spark" %% "spark-streaming-kafka" % "1.5.1",
// MySQL JDBC driver required at runtime by the insert code.
"mysql" % "mysql-connector-java" % "5.1.37",
"org.json4s" %% "json4s-native" % "3.2.10",
"org.json4s" %% "json4s-jackson" % "3.2.10"
)
// Deduplication policy applied by sbt-assembly when building the fat jar.
mergeStrategy in assembly := {
case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard
case m if m.toLowerCase.matches("meta-inf.*\\.sf$") => MergeStrategy.discard
case "log4j.properties" => MergeStrategy.discard
case m if m.toLowerCase.startsWith("meta-inf/services/") => MergeStrategy.filterDistinctLines
case "reference.conf" => MergeStrategy.concat
case _ => MergeStrategy.first
}
然后是对应的一个非常简单的查询hive的命令,然后把对应的结果插入到数据库的一个测试的例子程序
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql._
import org.apache.spark.sql.hive.HiveContext
import java.sql.{Connection, DriverManager, ResultSet};
object HiveFromSparkToMysql {

  /** Entry point: queries Hive through Spark SQL and pushes each resulting
    * (name, age) row into the MySQL table `user` on host `namenode`.
    */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("HiveFromSparkToMysql")
    val sc = new SparkContext(sparkConf)
    try {
      val hiveContext = new HiveContext(sc)
      import hiveContext.sql
      // collect() pulls every matching row to the driver; acceptable for this
      // small demo table, but use foreachPartition + batched inserts at scale.
      sql("SELECT name,age FROM wyp where age>10").collect().foreach(row => insertDB(row.getString(0), row.getInt(1)))
    } finally {
      sc.stop() // always release the SparkContext, even if the query fails
    }
  }

  /** Inserts a single (name, age) row into the MySQL table `user`.
    *
    * Errors are logged and swallowed (best-effort, preserving the original
    * demo's behavior); the statement and connection are always closed.
    *
    * @param name value for the `name` column
    * @param age  value for the `age` column
    */
  def insertDB(name: String, age: Int): Unit = {
    import java.sql.PreparedStatement
    import scala.util.control.NonFatal
    val url = "jdbc:mysql://namenode/test"
    val username = "root"
    val password = "rainboy"
    val driver = "com.mysql.jdbc.Driver"
    var conn: Connection = null
    var prep: PreparedStatement = null
    try {
      Class.forName(driver)
      conn = DriverManager.getConnection(url, username, password)
      // Parameterized statement: avoids SQL injection and quoting bugs.
      prep = conn.prepareStatement("INSERT INTO user (name, age) VALUES (?, ?)")
      prep.setString(1, name)
      prep.setInt(2, age)
      prep.executeUpdate()
    } catch {
      // NonFatal: let OutOfMemoryError, InterruptedException, etc. propagate.
      case NonFatal(e) => e.printStackTrace()
    } finally {
      // Original leaked the PreparedStatement and NPE'd on conn.close when
      // getConnection itself failed (conn still null); guard both closes.
      if (prep != null) prep.close()
      if (conn != null) conn.close()
    }
  }
}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql._
import org.apache.spark.sql.hive.HiveContext
import java.sql.{Connection, DriverManager, ResultSet};
object HiveFromSparkToMysql {

  /** Entry point: queries Hive through Spark SQL and pushes each resulting
    * (name, age) row into the MySQL table `user` on host `namenode`.
    */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("HiveFromSparkToMysql")
    val sc = new SparkContext(sparkConf)
    try {
      val hiveContext = new HiveContext(sc)
      import hiveContext.sql
      // collect() pulls every matching row to the driver; acceptable for this
      // small demo table, but use foreachPartition + batched inserts at scale.
      sql("SELECT name,age FROM wyp where age>10").collect().foreach(row => insertDB(row.getString(0), row.getInt(1)))
    } finally {
      sc.stop() // always release the SparkContext, even if the query fails
    }
  }

  /** Inserts a single (name, age) row into the MySQL table `user`.
    *
    * Errors are logged and swallowed (best-effort, preserving the original
    * demo's behavior); the statement and connection are always closed.
    *
    * @param name value for the `name` column
    * @param age  value for the `age` column
    */
  def insertDB(name: String, age: Int): Unit = {
    import java.sql.PreparedStatement
    import scala.util.control.NonFatal
    val url = "jdbc:mysql://namenode/test"
    val username = "root"
    val password = "rainboy"
    val driver = "com.mysql.jdbc.Driver"
    var conn: Connection = null
    var prep: PreparedStatement = null
    try {
      Class.forName(driver)
      conn = DriverManager.getConnection(url, username, password)
      // Parameterized statement: avoids SQL injection and quoting bugs.
      prep = conn.prepareStatement("INSERT INTO user (name, age) VALUES (?, ?)")
      prep.setString(1, name)
      prep.setInt(2, age)
      prep.executeUpdate()
    } catch {
      // NonFatal: let OutOfMemoryError, InterruptedException, etc. propagate.
      case NonFatal(e) => e.printStackTrace()
    } finally {
      // Original leaked the PreparedStatement and NPE'd on conn.close when
      // getConnection itself failed (conn still null); guard both closes.
      if (prep != null) prep.close()
      if (conn != null) conn.close()
    }
  }
}
对应的一些表结构信息具体如下,在数据库test中创建表
-- Target table for HiveFromSparkToMysql.insertDB, created in database `test`.
-- Table name must be lowercase `user`: on Linux MySQL with the default
-- lower_case_table_names=0, table names are case-sensitive, and the Scala
-- code executes `INSERT INTO user (name, age) ...` — creating `USER` would
-- make that insert fail.
CREATE TABLE user(
id INT(11) PRIMARY KEY NOT NULL AUTO_INCREMENT,
name VARCHAR(200) DEFAULT NULL, -- filled from Hive column `name`
age BIGINT(20) DEFAULT 0        -- filled from Hive column `age`
);
-- Table receiving the rows the Spark job inserts (INSERT INTO user ...).
CREATE TABLE user (
  id   INT(11)      NOT NULL AUTO_INCREMENT,
  name VARCHAR(200) DEFAULT NULL,
  age  BIGINT(20)   DEFAULT 0,
  PRIMARY KEY (id)
);
其他的对应的hive的表结构信息及内容,这里就不列举了,后面附上对应的执行提交到分布式系统中的命令
/usr/local/spark-1.5.1-bin-hadoop2.6/bin/spark-submit --class HiveFromSparkToMysql --master spark://namenode:7077 --executor-memory 1g /home/hadoop/test_sparktomysql/target/scala-2.10/SparkSqlToMysql-assembly-1.0.jar
最后附上一个完整的测试例子代码,含目录结构test_sparktomysql.tar
做一个简单的备注笔记