CDH 5.6: Spark (RDD vs. SQL) performance comparison

  • 1. Goal: read an HBase table with Spark (count the total number of rows and sum a numeric column) and see how data volume affects computation time.
  • 2. Data sample: an HBase table with 600,000 rows. Comparison of the two approaches:

RDD with cache(), single computation: ~90 s; additional computations add almost no time; harder to program.
DataFrame with registerTempTable("xx"), single computation: ~110 s; each additional computation adds noticeably more time; easier to program.
Spark RDD ----->  1x data:  time = 91,481 ms (~1.5 min); other runs: 89,042 ms, 80,385 ms
                 10x data:  time = 94,082 ms (almost unchanged)

Spark SQL ----->  1x data:  time = 145,012 ms (~2.5 min); other runs: 105,295 ms, 116,203 ms
                 10x data:  time = 375,126 ms (roughly 3x)
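
The gap on repeated computations comes from caching: the RDD version caches the parsed rows once, while every SQL query re-reads the HBase table. Spark SQL can also keep the registered temp table in memory; a minimal sketch, assuming the same sqlContext and the "person" temp table registered in the SQL code below (this variant was not part of the timing runs):

// Sketch only: cache the temp table so repeated queries reuse the in-memory copy
sqlContext.cacheTable("person")
sqlContext.sql("select count(DEPT), sum(COST) from person").show()  // first query materializes the cache
// ...later queries read from memory instead of re-scanning HBase...
sqlContext.uncacheTable("person")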

pom.xml file

    <repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
    </repositories>

    <dependencies>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.2.0</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.2.0</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.10</artifactId>
            <version>1.5.0-cdh5.6.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.10</artifactId>
            <version>1.5.0-cdh5.6.1</version>
        </dependency>
    </dependencies>

1. Spark SQL computation code

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object SparkSqlHbase {
  case class Per(DEPT: String, COST: Long, PERSON_NAME: String)

  def main(args: Array[String]): Unit = {
    val time1 = System.currentTimeMillis()
    val loop_counts = args(0).toInt  // how many times to repeat the aggregations

    // Spark configuration
    val conf = new SparkConf()
    conf.setAppName("sql-hbase: aggregation via SQL")
    conf.setMaster("local[*]")
    val sc = new SparkContext(conf)

    // Read the HBase table through TableInputFormat
    val hbaseconf = HBaseConfiguration.create()
    hbaseconf.set(TableInputFormat.INPUT_TABLE, "ns1:person")
    val rdd = sc.newAPIHadoopRDD(
      hbaseconf,
      classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result]
    )

    // Parse each row into a Per; fall back to an empty record on parse errors
    val rdd2 = rdd.map({ case (_, result) =>
      try {
        val dept = Bytes.toString(result.getValue("f".getBytes(), "DEPT".getBytes()))
        val cost = Bytes.toString(result.getValue("f".getBytes(), "COST".getBytes())).toLong
        val name = Bytes.toString(result.getValue("f".getBytes(), "PERSON_NAME".getBytes()))
        Per(dept, cost, name)
      } catch {
        case e: Exception => println("error parsing row: " + e); Per("", 0L, "")
      }
    })

    // Convert the RDD of case classes to a DataFrame and register it as a temp table
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._
    val df = rdd2.toDF()
    df.registerTempTable("person")

    // Run the two aggregations loop_counts times; each iteration re-executes the SQL
    for (i <- 0 until loop_counts) {
      println("===========>")
      sqlContext.sql("select count(DEPT) from person").show()
      sqlContext.sql("select sum(COST) from person").show()
    }
    println("===========> time= "+ (System.currentTimeMillis()-time1))
    sqlContext.dropTempTable("person")
    sc.stop()
  }
}
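
The same two aggregations can also be written with the DataFrame API instead of SQL strings; a small sketch against the df built above (equivalent logic, not part of the measured runs):

import org.apache.spark.sql.functions.{count, sum}

// Same count/sum as the SQL queries, expressed through the DataFrame API
df.agg(count("DEPT"), sum("COST")).show()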

2. RDD computation code

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}

object SparkSqlHbase2 {

  case class Per(DEPT: String, COST: Long, PERSON_NAME: String)

  def main(args: Array[String]): Unit = {
    val time1 = System.currentTimeMillis()
    val loop_counts = args(0).toInt

    // Spark configuration
    val conf = new SparkConf()
    conf.setAppName("rdd-hbase: aggregation via RDD")
    conf.setMaster("local[*]")
    val sc = new SparkContext(conf)

    // Read the HBase table through TableInputFormat
    val hbaseconf = HBaseConfiguration.create()
    hbaseconf.set(TableInputFormat.INPUT_TABLE, "ns1:person")
    val rdd = sc.newAPIHadoopRDD(
      hbaseconf,
      classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result]
    )

    // Parse each row; only COST is aggregated, DEPT and PERSON_NAME are read but unused
    val rdd2 = rdd.map({ case (_, result) =>
      val dept = Bytes.toString(result.getValue("f".getBytes(), "DEPT".getBytes()))
      val cost = Bytes.toString(result.getValue("f".getBytes(), "COST".getBytes())).toLong
      val name = Bytes.toString(result.getValue("f".getBytes(), "PERSON_NAME".getBytes()))
      ("COST", cost)
    })
    // Cache the parsed RDD so the repeated count/sum reuse it instead of re-scanning HBase
    rdd2.cache()

    for (i <- 0 until loop_counts) {
      val totalrows = rdd2.count()
      val totalCost = rdd2.reduceByKey(_ + _).collect()(0)._2
      println("===========>")
      println("totalrows=  "+ totalrows +", "+ "totalCost= "+ totalCost)
    }

    println("===========> time= " + (System.currentTimeMillis() - time1))
    sc.stop()
  }
}
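
Since every row is mapped to the same "COST" key, the reduceByKey above shuffles just to add up a single group. As a sketch (same result on the cached rdd2, not re-measured), the sum can be taken directly from the values:

// Sketch: sum the values without the single-key shuffle
val totalCost2 = rdd2.map(_._2).reduce(_ + _)   // stays in Long
// or: rdd2.map(_._2).sum()                     // returns Double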
