A few notes from learning Spark SQL

1. Spark SQL connecting to Hive
You can use org.apache.spark.sql.hive.HiveContext directly. This is the simplest approach; it is typically used for queries and aggregations and does not involve modifying data.
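
A minimal sketch of that approach, assuming a Spark 1.5 build with Hive support and a reachable metastore (hive-site.xml on the classpath); the table name src is only illustrative:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

object HiveQueryDemo {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("Hive-Demo")
    val sc = new SparkContext(conf)
    //HiveContext reads table metadata from the Hive metastore
    val hiveContext = new HiveContext(sc)
    //Plain HiveQL works, so queries and aggregations need no extra setup
    hiveContext.sql("SELECT key, count(*) FROM src GROUP BY key").show()
    sc.stop()
  }
}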

2. Spark SQL connecting to MySQL
Spark SQL can create a DataFrame by reading from a relational database over JDBC. After running a series of computations on the DataFrame, the result can be written back to the relational database.

Start the Spark shell

/usr/local/spark-1.5.2-bin-hadoop2.6/bin/spark-shell \
--master spark://node1.itcast.cn:7077 \
--jars /usr/local/spark-1.5.2-bin-hadoop2.6/mysql-connector-java-5.1.35-bin.jar \
--driver-class-path /usr/local/spark-1.5.2-bin-hadoop2.6/mysql-connector-java-5.1.35-bin.jar

Load data from MySQL and query it

val jdbcDF = sqlContext.read.format("jdbc").options(Map("url" -> "jdbc:mysql://192.168.10.1:3306/bigdata", "driver" -> "com.mysql.jdbc.Driver", "dbtable" -> "person", "user" -> "root", "password" -> "123456")).load()

//jdbcDF.show()

//Register the DataFrame as a temporary table so it can be queried with SQL
jdbcDF.registerTempTable("person")
val results = sqlContext.sql("SELECT name FROM person")
results.map(t => "Name: " + t(0)).collect().foreach(println)
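
The same DataFrame can also be filtered with the DataFrame API, and simple predicates are pushed down into the query Spark sends to MySQL. A small sketch, assuming the person table has name and age columns as in the write example below:

//Filter and project without writing SQL; the WHERE clause is pushed to MySQL
jdbcDF.filter(jdbcDF("age") >= 18).select("name", "age").show()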

Write data back to MySQL

import java.util.Properties
import org.apache.spark.sql.{SQLContext, Row}
import org.apache.spark.sql.types.{StringType, IntegerType, StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}

object JdbcRDD {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("MySQL-Demo")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    //Create an RDD by parallelizing a local collection
    val personRDD = sc.parallelize(Array("1 tom 5", "2 jerry 3", "3 kitty 6")).map(_.split(" "))
    //Specify the schema of each field directly with StructType
    val schema = StructType(
      List(
        StructField("id", IntegerType, true),
        StructField("name", StringType, true),
        StructField("age", IntegerType, true)
      )
    )
    //Map the RDD to an RDD of Rows
    val rowRDD = personRDD.map(p => Row(p(0).toInt, p(1).trim, p(2).toInt))
    //Apply the schema to the row RDD
    val personDataFrame = sqlContext.createDataFrame(rowRDD, schema)
    //Create a Properties object holding the database credentials
    val prop = new Properties()
    prop.put("user", "root")
    prop.put("password", "123456")
    //Append the data to the database table
    personDataFrame.write.mode("append").jdbc("jdbc:mysql://192.168.10.1:3306/bigdata", "bigdata.person", prop)
    //Stop the SparkContext
    sc.stop()
  }
}
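
To run this on the cluster instead of the shell, package it and submit it with the MySQL driver on both the executor and driver classpaths; the application jar name below is a placeholder:

/usr/local/spark-1.5.2-bin-hadoop2.6/bin/spark-submit \
--master spark://node1.itcast.cn:7077 \
--class JdbcRDD \
--jars /usr/local/spark-1.5.2-bin-hadoop2.6/mysql-connector-java-5.1.35-bin.jar \
--driver-class-path /usr/local/spark-1.5.2-bin-hadoop2.6/mysql-connector-java-5.1.35-bin.jar \
mysql-demo.jar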

3. Spark connecting to HBase
(Alternatively, you can connect to HBase by creating an external Hive table that maps onto the HBase table.)
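
A minimal sketch of that Hive route, issued through HiveContext (it assumes an existing SparkContext sc and hive-hbase-handler on the classpath; the table name and column mapping match the HBase example that follows, but are otherwise assumptions):

import org.apache.spark.sql.hive.HiveContext

val hiveContext = new HiveContext(sc)
//Map the HBase table "table4" into Hive; :key is the row key, cf1:col1 a regular column
hiveContext.sql("""
  CREATE EXTERNAL TABLE IF NOT EXISTS hbase_table4 (col0 STRING, col1 INT)
  STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
  WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf1:col1")
  TBLPROPERTIES ("hbase.table.name" = "table4")
""")
//Once mapped, the HBase data is queryable like any other Hive table
hiveContext.sql("SELECT count(col1), avg(col1) FROM hbase_table4").show()

The rest of this section reads HBase directly with the hbase-spark connector instead: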

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.datasources.HBaseTableCatalog
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql._

case class HBaseRecord(
  col0: String,
  col1: Int)

object HBaseRecord {
  def apply(i: Int, t: Int): HBaseRecord = {
    val s = s"""row${"%03d".format(i)}"""
    HBaseRecord(s,
      i)
  }
}

object Test {
  def main(args: Array[String]) {

    //conf targets the cluster, but the SparkContext below is created in local mode for testing
    val conf = new SparkConf().setAppName("test spark sql")
    conf.setMaster("spark://master:7077")
    val sc = new SparkContext("local", "test") //or: new SparkContext(conf) to run on the cluster
    val config = HBaseConfiguration.create()
    //config.addResource("/home/hadoop/hbase-1.2.2/conf/hbase-site.xml")
    //config.set("hbase.zookeeper.quorum", "node1,node2,node3")
    //Constructing HBaseContext registers the HBase configuration for the data source
    val hbaseContext = new HBaseContext(sc, config, null)

    def catalog = s"""{
       |"table":{"namespace":"default", "name":"table4"},
       |"rowkey":"key",
       |"columns":{
         |"col0":{"cf":"rowkey", "col":"key", "type":"string"},
         |"col1":{"cf":"cf1", "col":"col1", "type":"int"}
       |}
     |}""".stripMargin

    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    def withCatalog(cat: String): DataFrame = {
      sqlContext
        .read
        .options(Map(HBaseTableCatalog.tableCatalog -> cat))
        .format("org.apache.hadoop.hbase.spark")
        .load()
    }
    val df = withCatalog(catalog)

    val res = df.select("col1")
    //res.save("hdfs://master:9000/user/yang/a.txt")
    res.show()
    //Register the DataFrame so it can be queried with SQL
    df.registerTempTable("table4")
    sqlContext.sql("select count(col0), sum(col1) from table4 where col1 > 20 and col1 < 26").show()
    println("-----------------------------------------------------")
    sqlContext.sql("select count(col1), avg(col1) from table4").show()
  }
}
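
The HBaseRecord case class defined above is also the usual vehicle for writing rows back through the same catalog. A rough sketch to be placed inside main, assuming the hbase-spark data source honours the HBaseTableCatalog.newTable option (number of regions to use when creating the table):

    //Generate some rows and write them to HBase through the catalog mapping
    val data = (0 to 255).map(i => HBaseRecord(i, i))
    sc.parallelize(data).toDF.write
      .options(Map(HBaseTableCatalog.tableCatalog -> catalog, HBaseTableCatalog.newTable -> "5"))
      .format("org.apache.hadoop.hbase.spark")
      .save()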