1.spark sql 连接hive
可以直接使用 org.apache.spark.sql.hive.HiveContext,这个是最简单的,一般都是查询统计,不涉及到修改
2.spark sql 连接 mysql
Spark SQL可以通过JDBC从关系型数据库中读取数据的方式创建DataFrame,通过对DataFrame一系列的计算后,还可以将数据再写回关系型数据库中。
启动Spark Shell
/usr/local/spark-1.5.2-bin-hadoop2.6/bin/spark-shell \
--master spark://node1.itcast.cn:7077 \
--jars /usr/local/spark-1.5.2-bin-hadoop2.6/mysql-connector-java-5.1.35-bin.jar \
--driver-class-path /usr/local/spark-1.5.2-bin-hadoop2.6/mysql-connector-java-5.1.35-bin.jar
从mysql中加载数据,查询
// Load the MySQL table "person" into a DataFrame via the JDBC data source.
val jdbcDF = sqlContext.read.format("jdbc").options(Map("url" -> "jdbc:mysql://192.168.10.1:3306/bigdata", "driver" -> "com.mysql.jdbc.Driver", "dbtable" -> "person", "user" -> "root", "password" -> "123456")).load()
//jdbcDF.show()
// BUG FIX: the original queried a table named "people" that was never
// registered with the SQLContext, so sqlContext.sql would fail with
// "Table not found". Register the DataFrame as a temp table first and
// query it under the same name it was loaded as.
jdbcDF.registerTempTable("person")
val results = sqlContext.sql("SELECT * FROM person")
// Print the first column (name) of every row.
results.map(t => "Name: " + t(0)).collect().foreach(println)
将数据写入到MySQL中
import java.util.Properties
import org.apache.spark.sql.{SQLContext, Row}
import org.apache.spark.sql.types.{StringType, IntegerType, StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}
object JdbcRDD {
  /**
   * Builds a small DataFrame from an in-memory array of "id name age"
   * strings and appends the rows to the MySQL table bigdata.person
   * over JDBC.
   */
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("MySQL-Demo")
    val sparkContext = new SparkContext(sparkConf)
    val sqlCtx = new SQLContext(sparkContext)
    // Parallelize the raw records and split each into its whitespace-separated fields.
    val fieldsRDD = sparkContext
      .parallelize(Array("1 tom 5", "2 jerry 3", "3 kitty 6"))
      .map(_.split(" "))
    // Explicit schema for the three columns (all nullable).
    val personSchema = StructType(List(
      StructField("id", IntegerType, true),
      StructField("name", StringType, true),
      StructField("age", IntegerType, true)))
    // Convert each field array into a Row matching the schema.
    val rows = fieldsRDD.map(fields => Row(fields(0).toInt, fields(1).trim, fields(2).toInt))
    val personDF = sqlCtx.createDataFrame(rows, personSchema)
    // JDBC connection credentials.
    val connProps = new Properties()
    connProps.put("user", "root")
    connProps.put("password", "123456")
    // Append (rather than overwrite) the rows into the target table.
    personDF.write.mode("append").jdbc("jdbc:mysql://192.168.10.1:3306/bigdata", "bigdata.person", connProps)
    // Release the SparkContext's resources.
    sparkContext.stop()
  }
}
3.spark 连接hbase
(可以通过hive创建外部表连接hbase)
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.{ HBaseConfiguration, HColumnDescriptor, HTableDescriptor }
import org.apache.hadoop.hbase.client.{ HBaseAdmin, HTable, Put }
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.datasources.hbase._
import org.apache.hadoop.hbase.spark.datasources.HBaseScanPartition
import org.apache.hadoop.hbase.util.Bytes
/** One HBase row as seen through the catalog: rowkey string plus a single int column. */
case class HBaseRecord(col0: String, col1: Int)

object HBaseRecord {
  /**
   * Builds a record whose rowkey is "rowNNN" (the index zero-padded to three
   * digits) and whose col1 is the index itself.
   *
   * NOTE(review): the `t` parameter is accepted but never used; it is kept
   * for source compatibility with existing callers.
   */
  def apply(i: Int, t: Int): HBaseRecord = {
    val rowKey = "row%03d".format(i)
    HBaseRecord(rowKey, i)
  }
}
object Test {
  /**
   * Reads HBase table "table4" through the Spark-HBase connector using a
   * JSON catalog mapping, then runs a few SQL aggregations over it.
   */
  def main(args: Array[String]) {
    // NOTE(review): this conf (and its cluster master URL) is never passed to
    // the SparkContext below, which is hard-wired to local mode — confirm
    // which deployment is intended before shipping.
    val conf = new SparkConf().setAppName("test spark sql");
    conf.setMaster("spark://master:7077");
    val sc = new SparkContext("local", "test") //new SparkContext(conf)//
    val config = HBaseConfiguration.create()
    //config.addResource("/home/hadoop/hbase-1.2.2/conf/hbase-site.xml");
    //config.set("hbase.zookeeper.quorum", "node1,node2,node3");
    // Registers the HBase configuration with Spark so the data source below can use it.
    val hbaseContext = new HBaseContext(sc, config, null)
    // Catalog JSON: maps DataFrame column col0 onto the HBase rowkey and
    // col1 onto column family cf1, qualifier col1 (typed int).
    def catalog = s"""{
    |"table":{"namespace":"default", "name":"table4"},
    |"rowkey":"key",
    |"columns":{
    |"col0":{"cf":"rowkey", "col":"key", "type":"string"},
    |"col1":{"cf":"cf1", "col":"col1", "type":"int"}
    |}
    |}""".stripMargin
    val sqlContext = new SQLContext(sc);
    import sqlContext.implicits._
    // Loads the HBase table described by the catalog as a DataFrame.
    def withCatalog(cat: String): DataFrame = {
      sqlContext
        .read
        .options(Map(HBaseTableCatalog.tableCatalog -> cat))
        .format("org.apache.hadoop.hbase.spark")
        .load()
    }
    val df = withCatalog(catalog)
    val res = df.select("col1")
    //res.save("hdfs://master:9000/user/yang/a.txt")
    res.show()
    df.registerTempTable("table4")
    // NOTE(review): col1 is declared "int" in the catalog but is compared
    // against the string literals '20'/'26' here — verify the implicit cast
    // produces the intended numeric (not lexicographic) comparison.
    sqlContext.sql("select count(col0),sum(col1) from table4 where col1>'20' and col1<'26' ").show
    println("-----------------------------------------------------");
    sqlContext.sql("select count(col1),avg(col1) from table4").show
    // FIX: the original never stopped the SparkContext; release its
    // resources before the JVM exits.
    sc.stop()
  }
}