Spark SQL: querying an HBase table, and implementing the same SQL query with RDDs

  • HBase data (requirement: group by age, compute the number of people in each age group, and save the result to MySQL):
hbase(main):006:0> scan 'p1'
ROW        COLUMN+CELL                                                                                               
 r1        column=f:age, timestamp=1556181658918, value=23                                                           
 r1        column=f:name, timestamp=1556181620870, value=lisi                                                        
 r2        column=f:age, timestamp=1556181665529, value=25                                                                              
 r2        column=f:name, timestamp=1556181630233, value=lisi2                                                       
 r3        column=f:age, timestamp=1556181671939, value=34                                                           
 r3        column=f:name, timestamp=1556181639913, value=lisi3                                                       
 r4        column=f:age, timestamp=1557247762486, value=25                                                           
 r4        column=f:name, timestamp=1557247747800, value=a1    
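
For reference, the p1 table can be populated with the HBase client API roughly as follows (a sketch reconstructed from the scan output above, not part of the original post; it uses the hbase-client dependency declared in the Maven section below):

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes

val conn = ConnectionFactory.createConnection(HBaseConfiguration.create())
val table = conn.getTable(TableName.valueOf("p1"))
Seq(("r1", "lisi", "23"), ("r2", "lisi2", "25"), ("r3", "lisi3", "34"), ("r4", "a1", "25"))
  .foreach { case (rowKey, name, age) =>
    val put = new Put(Bytes.toBytes(rowKey))
    put.addColumn(Bytes.toBytes("f"), Bytes.toBytes("name"), Bytes.toBytes(name))
    put.addColumn(Bytes.toBytes("f"), Bytes.toBytes("age"), Bytes.toBytes(age))
    table.put(put)
  }
table.close()
conn.close()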

1. Querying HBase with Spark SQL

I. Prepare the SQL statement

select count(*),age from person  group by age
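
Against the sample data above, this should return one row per age group: 23 -> 1, 25 -> 2, 34 -> 1.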

II. Run the SQL statement with Spark SQL

a. Maven configuration

<repositories>
   <repository>
        <id>cloudera</id>
        <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
    </repository>
</repositories>

<dependencies>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-client</artifactId>
        <version>1.2.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-server</artifactId>
        <version>1.2.0</version>
    </dependency>

    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.10</artifactId>
        <version>1.5.0-cdh5.6.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.10</artifactId>
        <version>1.5.0-cdh5.6.1</version>
    </dependency>

    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.17</version>
    </dependency>
</dependencies>

b. Running the SQL with Spark SQL

import java.util.Properties
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object SparkSqlHbase2_write_mysql {
  def main(args: Array[String]): Unit = {
    // create the Spark configuration and contexts
    val conf = new SparkConf()
    conf.setMaster("local[*]").setAppName("test")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // read the HBase table via TableInputFormat
    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.set(TableInputFormat.INPUT_TABLE, "p1")
    val hbaseRDD = sc.newAPIHadoopRDD(hbaseConf,
      classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result]
    )

    // map each HBase Result to a (name, age) tuple
    val tupRDD = hbaseRDD.map({
      case (_, result) => {
        val name = Bytes.toString(result.getValue(Bytes.toBytes("f"), Bytes.toBytes("name")))
        val age = Bytes.toString(result.getValue(Bytes.toBytes("f"), Bytes.toBytes("age")))

        // emit a (name, age) tuple for this row
        (name,age)
      }
    })

    // build the DataFrame from the RDD and register it as a temp table
    import sqlContext.implicits._
    val df = tupRDD.toDF("name","age")
    df.registerTempTable("person")

    val df2 = sqlContext.sql("select name,age from person")
    df2.show()

    // write the result to the MySQL table test.person
    val prop = new Properties()
    prop.put("user", "root")
    prop.put("password", "123456")
    df2.write.mode("append").jdbc("jdbc:mysql://localhost:3306/test","person",prop)
  }

}
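
The query above only selects name and age before saving; to produce the per-age counts required at the top, the SQL prepared in step I can be run and written out instead. A minimal sketch (to be added inside main after the code above; the target table name age_count is made up here):

    val df3 = sqlContext.sql("select age, count(*) as cnt from person group by age")
    df3.write.mode("append").jdbc("jdbc:mysql://localhost:3306/test", "age_count", prop)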

2. Implementing the HBase SQL query with RDDs

I. Mock data source 1: MySQL

val dfreader=sqlContext.read.format("jdbc")
.option("url", "jdbc:mysql://localhost:3306/test")
.option("driver","com.mysql.jdbc.Driver")
.option("dbtable", "person")
.option("user", "root")
.option("password", "123456")

val df = dfreader.load()
df.show() // print the data
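
The MySQL-backed DataFrame can then be queried like any other. For example, registering it and running the same aggregate (a sketch, assuming the sqlContext from part 1; the temp table name person_mysql is made up here):

df.registerTempTable("person_mysql")
sqlContext.sql("select age, count(*) from person_mysql group by age").show()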

II. Mock data source 2: text file

wang@controller:~$ cat a.txt 
a,23,java
a1,23,java
a2,23,java
a3,23,java
b,23,py
c,45,go
d,34,go
### RDD code
 sc.setLogLevel("ERROR")
 val rdd = sc.textFile("file:///home/wang/a.txt")
 val rdd2=rdd.map(str=>{
   val infos = str.split(",")
   (infos(2),(infos.mkString(","),1))
 })

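 // per course key: concatenate the detail strings and sum the counts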
 val rdd3 = rdd2.reduceByKey((tup1,tup2)=>{
   val key1=tup1._1
   val key2=tup2._1
   val extrMesg=key1+";"+key2
   val count= tup1._2+tup2._2

   (extrMesg,count)
 })
 
 rdd3.foreach(tup=>{
   val key = tup._1
   val tup_arr = tup._2
   val infostr = tup_arr._1
   val count = tup_arr._2


   println(key+"->"+count+"|"+infostr)
// py->1|b,23,py
// java->4|a,23,java;a1,23,java;a2,23,java;a3,23,java
 })
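
If only the per-key counts are needed (without collecting the detail strings), the same grouping can be expressed more compactly, for example with countByKey; a sketch over the same rdd:

 val counts = rdd.map(line => (line.split(",")(2), 1)).countByKey()
 // counts: Map(java -> 4, py -> 1, go -> 2)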

III. Real data source: HBase

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}

object Count3 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("test")
    val sc = new SparkContext(conf)
    sc.setLogLevel("ERROR")

    // read the HBase table via TableInputFormat
    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.set(TableInputFormat.INPUT_TABLE, "p1")
    val hbaseRDD = sc.newAPIHadoopRDD(hbaseConf,
      classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result]
    )
    val rdd2 = hbaseRDD.map({
      case (_, result) => {
        val name = Bytes.toString(result.getValue(Bytes.toBytes("f"), Bytes.toBytes("name")))
        val age = Bytes.toString(result.getValue(Bytes.toBytes("f"), Bytes.toBytes("age")))

        // emit (age, (name, 1)) so rows can be grouped by age
        (age, (name, 1))
      }
    })

    val rdd3 = rdd2.reduceByKey((tup1, tup2) => {
      val name1 = tup1._1
      val count1 = tup1._2

      val name2 = tup2._1
      val count2 = tup2._2

      // collect the names and accumulate the per-age count
      (name1 + "," + name2, count1 + count2)
    })

    rdd3.foreach(tup => {
      val key = tup._1

      val tup_arr = tup._2
      val infostr = tup_arr._1
      val count = tup_arr._2


      println(key + "->" + count + "|" + infostr)
      println("----------")
      //      34->1|lisi3
      //      ----------
      //      25->2|lisi2,a1
      //      ----------
      //      23->1|lisi
      //      ----------
    })
  }

}
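
To complete the requirement stated at the top (saving the per-age counts to MySQL), the aggregated RDD can be converted to a DataFrame and appended with the same JDBC settings as in part 1. A minimal sketch, to be placed inside main after rdd3 is computed (the table name age_count and the column names are made up here):

    import java.util.Properties
    import org.apache.spark.sql.SQLContext

    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // one row per age group: (age, count, concatenated names)
    val resultDF = rdd3.map { case (age, (names, cnt)) => (age, cnt, names) }
      .toDF("age", "cnt", "names")

    val prop = new Properties()
    prop.put("user", "root")
    prop.put("password", "123456")
    resultDF.write.mode("append").jdbc("jdbc:mysql://localhost:3306/test", "age_count", prop)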