import java.util.Properties
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
object SparkSqlHbase2_write_mysql {
  def main(args: Array[String]): Unit = {
    // Create the Spark configuration and contexts
    val conf = new SparkConf()
    conf.setMaster("local[*]").setAppName("test")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // Read data from the HBase table "p1"
    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.set(TableInputFormat.INPUT_TABLE, "p1")
    val hbaseRDD = sc.newAPIHadoopRDD(hbaseConf,
      classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result])

    // RDD --> DataFrame: extract the name/age columns from column family "f"
    val tupRDD = hbaseRDD.map { case (_, result) =>
      val name = Bytes.toString(result.getValue(Bytes.toBytes("f"), Bytes.toBytes("name")))
      val age = Bytes.toString(result.getValue(Bytes.toBytes("f"), Bytes.toBytes("age")))
      (name, age)
    }

    // Build the schema, convert the RDD to a DataFrame, and register it as a table
    import sqlContext.implicits._
    val df = tupRDD.toDF("name", "age")
    df.registerTempTable("person")
    val df2 = sqlContext.sql("select name,age from person")
    df2.show()

    // Write the result to MySQL: test.person
    val prop = new Properties()
    prop.put("user", "root")
    prop.put("password", "123456")
    df2.write.mode("append").jdbc("jdbc:mysql://localhost:3306/test", "person", prop)
  }
}
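To double-check the append, the MySQL table can be read back through the same JDBC connector. A minimal sketch, continuing inside main above and assuming the same test.person table and credentials:

// Sketch: read test.person back from MySQL to verify the write (same URL and prop assumed)
val checkDF = sqlContext.read.jdbc("jdbc:mysql://localhost:3306/test", "person", prop)
checkDF.show()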
2. Implementing HBase SQL-style queries with RDDs
I. Simulated data source 1: MySQL
val dfreader = sqlContext.read.format("jdbc")
  .option("url", "jdbc:mysql://localhost:3306/test")
  .option("driver", "com.mysql.jdbc.Driver")
  .option("dbtable", "person")
  .option("user", "root")
  .option("password", "123456")
val df = dfreader.load()
df.show()  // print the data
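Since the goal of this part is SQL-style querying, the loaded DataFrame can also be registered as a temp table and queried directly. A minimal sketch; the temp table name person_mysql is just an example:

// Sketch: register the MySQL-backed DataFrame and query it with SQL
df.registerTempTable("person_mysql")
sqlContext.sql("select name, age from person_mysql").show()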
II. Simulated data source: text file
wang@controller:~$ cat a.txt
a,23,java
a1,23,java
a2,23,java
a3,23,java
b,23,py
c,45,go
d,34,go
### RDD programming
sc.setLogLevel("ERROR")
val rdd = sc.textFile("file:///home/wang/a.txt")

// Key each line by its third field (the language) and pair it with (full line, 1)
val rdd2 = rdd.map(str => {
  val infos = str.split(",")
  (infos(2), (infos.mkString(","), 1))
})

// For each key, concatenate the detail strings and sum the counts
val rdd3 = rdd2.reduceByKey((tup1, tup2) => {
  val key1 = tup1._1
  val key2 = tup2._1
  val extrMesg = key1 + ";" + key2
  val count = tup1._2 + tup2._2
  (extrMesg, count)
})

rdd3.foreach(tup => {
  val key = tup._1
  val tup_arr = tup._2
  val infostr = tup_arr._1
  val count = tup_arr._2
  println(key + "->" + count + "|" + infostr)
  // py->1|b,23,py
  // java->4|a,23,java;a1,23,java;a2,23,java;a3,23,java
})
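The same group-and-collect result can also be written with groupByKey instead of reduceByKey; a sketch for comparison only, since reduceByKey combines values on the map side and is usually preferred:

// Sketch: same aggregation with groupByKey (shown only for comparison)
val rdd3b = rdd.map(str => {
  val infos = str.split(",")
  (infos(2), str)
}).groupByKey().mapValues(lines => (lines.mkString(";"), lines.size))
rdd3b.foreach { case (key, (infostr, count)) => println(key + "->" + count + "|" + infostr) }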
III. Real data source: HBase
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}
object Count3 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("test")
    val sc = new SparkContext(conf)
    sc.setLogLevel("ERROR")

    // Read data from the HBase table "p1"
    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.set(TableInputFormat.INPUT_TABLE, "p1")
    val hbaseRDD = sc.newAPIHadoopRDD(hbaseConf,
      classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result])

    // Key each row by age and pair it with (name, 1)
    val rdd2 = hbaseRDD.map { case (_, result) =>
      val name = Bytes.toString(result.getValue(Bytes.toBytes("f"), Bytes.toBytes("name")))
      val age = Bytes.toString(result.getValue(Bytes.toBytes("f"), Bytes.toBytes("age")))
      (age, (name, 1))
    }

    // Collect the name details and sum the group counts
    val rdd3 = rdd2.reduceByKey((tup1, tup2) => {
      val name1 = tup1._1
      val count1 = tup1._2
      val name2 = tup2._1
      val count2 = tup2._2
      (name1 + "," + name2, count1 + count2)
    })

    rdd3.foreach(tup => {
      val key = tup._1
      val tup_arr = tup._2
      val infostr = tup_arr._1
      val count = tup_arr._2
      println(key + "->" + count + "|" + infostr)
      println("----------")
      // 34->1|lisi3
      // ----------
      // 25->2|lisi2,a1
      // ----------
      // 23->1|lisi
      // ----------
    })
  }
}
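For comparison with the pure-RDD version, the same per-age count can be expressed through Spark SQL as in part 1. A minimal sketch that would sit inside main above (it reuses sc and hbaseRDD) and only produces the counts, not the concatenated name details:

// Sketch: per-age count with Spark SQL instead of reduceByKey (counts only)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
val personDF = hbaseRDD.map { case (_, result) =>
  (Bytes.toString(result.getValue(Bytes.toBytes("f"), Bytes.toBytes("name"))),
   Bytes.toString(result.getValue(Bytes.toBytes("f"), Bytes.toBytes("age"))))
}.toDF("name", "age")
personDF.registerTempTable("person")
sqlContext.sql("select age, count(*) as cnt from person group by age").show()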