- 1. Goal and requirements: read HBase data with Spark (count the table's total rows and sum one column), and observe how the amount of computation affects elapsed time.
- 2. Data sample: an HBase table with 600,000 rows. Comparison of the two approaches:
| Approach | One pass | Scaling behavior | Programming difficulty |
|---|---|---|---|
| RDD with `cache()` | ~90 s | More iterations barely increase elapsed time | Harder |
| DataFrame with `registerTempTable("xx")` | ~110 s | Elapsed time grows steadily with more iterations | Easier |
Measured timings (elapsed `time=` for the given loop count):

- Spark RDD, 1x workload: time = 91,481 ms (~1.5 min); repeat runs: 89,042 ms and 80,385 ms
- Spark RDD, 10x workload: 94,082 ms (almost unchanged: the cached RDD is scanned from HBase only once, so extra iterations are cheap)
- Spark SQL, 1x workload: time = 145,012 ms (~2.5 min); repeat runs: 105,295 ms and 116,203 ms
- Spark SQL, 10x workload: 375,126 ms (about 3x longer: the uncached temp table re-reads HBase on every query)
pom.xml file:
```xml
<repositories>
    <repository>
        <id>cloudera</id>
        <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
    </repository>
</repositories>

<dependencies>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-client</artifactId>
        <version>1.2.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-server</artifactId>
        <version>1.2.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.10</artifactId>
        <version>1.5.0-cdh5.6.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.10</artifactId>
        <version>1.5.0-cdh5.6.1</version>
    </dependency>
</dependencies>
```
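For sbt users, a roughly equivalent `build.sbt` might look like the sketch below (an assumption, not from the original project; version strings are copied from the pom, and Scala 2.10 matches the `_2.10` artifact suffix):

```scala
// Hypothetical build.sbt mirroring the Maven dependencies above.
scalaVersion := "2.10.5"

resolvers += "cloudera" at "https://repository.cloudera.com/artifactory/cloudera-repos/"

libraryDependencies ++= Seq(
  "org.apache.hbase" %  "hbase-client" % "1.2.0",
  "org.apache.hbase" %  "hbase-server" % "1.2.0",
  "org.apache.spark" %% "spark-core"   % "1.5.0-cdh5.6.1", // resolves to spark-core_2.10
  "org.apache.spark" %% "spark-sql"    % "1.5.0-cdh5.6.1"
)
```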
1. Spark SQL computation code:
```scala
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object SparkSqlHbase {

  case class Per(DEPT: String, COST: Long, PERSON_NAME: String)

  def main(args: Array[String]): Unit = {
    val time1 = System.currentTimeMillis()
    val loop_counts = args(0).toInt

    // Spark configuration
    val conf = new SparkConf()
    conf.setAppName("sql-hbase: SQL queries")
    conf.setMaster("local[*]")
    val sc = new SparkContext(conf)

    // Read the HBase table as an RDD of (row key, Result) pairs
    val hbaseconf = HBaseConfiguration.create()
    hbaseconf.set(TableInputFormat.INPUT_TABLE, "ns1:person")
    val rdd = sc.newAPIHadoopRDD(
      hbaseconf,
      classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result]
    )

    // Parse each Result into a Per; fall back to an empty record on bad rows
    val rdd2 = rdd.map { case (_, result) =>
      try {
        val id   = Bytes.toString(result.getValue("f".getBytes(), "DEPT".getBytes()))
        val cost = Bytes.toString(result.getValue("f".getBytes(), "COST".getBytes())).toLong
        val name = Bytes.toString(result.getValue("f".getBytes(), "PERSON_NAME".getBytes()))
        Per(id, cost, name)
      } catch {
        case _: Exception => println("error..."); Per("", 0, "")
      }
    }

    // Convert to a DataFrame and register it as a temporary table
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._
    val df = rdd2.toDF()
    df.registerTempTable("person")

    // Each iteration runs both queries; without caching, every query
    // re-reads the HBase table from scratch
    for (i <- 0 until loop_counts) {
      println("===========>")
      sqlContext.sql("select count(DEPT) from person").show()
      sqlContext.sql("select sum(COST) from person").show()
    }

    println("===========> time= " + (System.currentTimeMillis() - time1))
    sqlContext.dropTempTable("person")
    sc.stop()
  }
}
```
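The gap in the table above is mostly explained by caching: `registerTempTable` only creates a logical view, so every `sqlContext.sql(...)` call re-scans HBase, while the RDD version pins the parsed data in memory. To level the comparison, the temp table can be cached as well; a minimal sketch (not part of the original benchmark), placed right after `df.registerTempTable("person")`:

```scala
// Pin the temp table in memory so repeated queries skip the HBase scan.
// Assumption: with this added, the SQL timings should scale nearly flat
// with loop_counts, like the cached-RDD version.
sqlContext.cacheTable("person")

// ... run the queries in the loop ...

sqlContext.uncacheTable("person") // release the cache when finished
```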
2. RDD computation code:
```scala
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}

object SparkSqlHbase2 {

  case class Per(DEPT: String, COST: Long, PERSON_NAME: String)

  def main(args: Array[String]): Unit = {
    val time1 = System.currentTimeMillis()
    val loop_counts = args(0).toInt

    // Spark configuration
    val conf = new SparkConf()
    conf.setAppName("rdd-hbase: computation")
    conf.setMaster("local[*]")
    val sc = new SparkContext(conf)

    // Read the HBase table as an RDD of (row key, Result) pairs
    val hbaseconf = HBaseConfiguration.create()
    hbaseconf.set(TableInputFormat.INPUT_TABLE, "ns1:person")
    val rdd = sc.newAPIHadoopRDD(
      hbaseconf,
      classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result]
    )

    // Parse every column (to match the parsing work of the SQL version),
    // but keep only ("COST", cost) pairs for the aggregation
    val rdd2 = rdd.map { case (_, result) =>
      val id   = Bytes.toString(result.getValue("f".getBytes(), "DEPT".getBytes()))
      val cost = Bytes.toString(result.getValue("f".getBytes(), "COST".getBytes())).toInt
      val name = Bytes.toString(result.getValue("f".getBytes(), "PERSON_NAME".getBytes()))
      ("COST", cost)
    }

    // Cache, so HBase is scanned only on the first action
    rdd2.cache()

    for (i <- 0 until loop_counts) {
      val totalrows = rdd2.count()
      val totalCost = rdd2.reduceByKey(_ + _).collect()(0)._2
      println("===========>")
      println("totalrows= " + totalrows + ", totalCost= " + totalCost)
    }

    println("===========> time= " + (System.currentTimeMillis() - time1))
    sc.stop()
  }
}
```
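A design note on the loop body: `count()` and `reduceByKey(...).collect()` launch two separate Spark jobs per iteration. Because every record carries the same literal key `"COST"`, a single `aggregate` pass over the cached RDD can produce both numbers in one job; a sketch of that variant (not the original code):

```scala
// Compute (row count, cost sum) in one job over the cached pair RDD.
// rdd2 is the ("COST", cost) RDD from the listing above.
val (totalRows2, totalCost2) = rdd2.aggregate((0L, 0L))(
  (acc, kv) => (acc._1 + 1, acc._2 + kv._2), // fold records within a partition
  (a, b) => (a._1 + b._1, a._2 + b._2)       // merge per-partition results
)
println(s"totalRows=$totalRows2, totalCost=$totalCost2")
```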