1. Steps: Programming in IDEA
1.1 Add dependencies
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>Spark</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.2.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive-thriftserver_2.11</artifactId>
            <version>2.2.1</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.27</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-client -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.2.1</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-server -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.2.1</version>
        </dependency>
    </dependencies>
</project>
1.2 Operations on the cluster
1. Start Hadoop and the Hive metastore service
bin/hive --service metastore &
2. Start ZooKeeper
bin/zkServer.sh start        # run jps; seeing QuorumPeerMain means ZooKeeper started successfully
3. Start HBase
bin/start-hbase.sh           # run jps; seeing HMaster and HRegionServer means HBase started successfully
4. Copy the HBase configuration: place the hbase-site.xml file into the project's resources directory
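The job in 1.3 writes to an HBase table named df2Hbase with a column family info, and that table must exist before the Put calls run. You can create it in the hbase shell with create 'df2Hbase', 'info', or with a minimal client-API sketch like the one below (the object name CreateDf2HbaseTable is just an example; it assumes hbase-site.xml is on the classpath):

import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory

object CreateDf2HbaseTable {
  def main(args: Array[String]): Unit = {
    // reads hbase-site.xml from the classpath
    val conf = HBaseConfiguration.create()
    val conn = ConnectionFactory.createConnection(conf)
    val admin = conn.getAdmin
    val tableName = TableName.valueOf("df2Hbase")
    if (!admin.tableExists(tableName)) {
      val desc = new HTableDescriptor(tableName)
      // column family used by the Spark job below
      desc.addFamily(new HColumnDescriptor("info"))
      admin.createTable(desc)
    }
    admin.close()
    conn.close()
  }
}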
1.3 The code
package Day.Day6
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{HTable, Put}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{DoubleType, IntegerType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import scala.util.Random
object sql2hbase {
  def main(args: Array[String]): Unit = {
    // 1. Create the SparkSession
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("rdd2rdf")
      .enableHiveSupport()
      .getOrCreate()
    // 2. Get the SparkContext
    val sc = spark.sparkContext
    // 3. Build the test RDD
    val random = Random
    val rdd: RDD[(Int, Double)] = sc.parallelize((0 to 10).flatMap(i => {
      // repeat each key a random number of times
      (0 to (random.nextInt(10) + 2)).map(j => i)
    }).map(i => {
      // attach a random value to each key
      (i, random.nextDouble() * 1000 + 10)
    }))
    import spark.implicits._
    val df1 = rdd.toDF("id", "sal")
    df1.createTempView("user")
    val df2: DataFrame = spark.sql(
      """
        |select id,
        |round(avg(sal), 2) as sall
        |from user
        |group by id
        |""".stripMargin)
    val rdd2 = df2.rdd
    rdd2.foreachPartition(iter => {
      // 1. Create the HBase configuration object (picks up hbase-site.xml from the classpath)
      val conf = HBaseConfiguration.create()
      // 2. Create the table object
      val table = new HTable(conf, "df2Hbase")
      iter.foreach(row => {
        // 3. Read the fields from the Row
        val id = row.getAs[Int]("id")
        val sal = row.getAs[Double]("sall")
        // 4. Build a Put and add the data (row key = id, column info:sal = sal)
        val put = new Put(Bytes.toBytes(id))
        put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("sal"), Bytes.toBytes(sal))
        // 5. Write the Put into the table
        table.put(put)
      })
      table.close()
    })
  }
}
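Once the job has finished, you can check that the rows actually landed in HBase with scan 'df2Hbase' in the hbase shell, or with a small read-back sketch like the one below (the object name ScanDf2Hbase is arbitrary; it again assumes hbase-site.xml is on the classpath):

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Scan}
import org.apache.hadoop.hbase.util.Bytes
import scala.collection.JavaConverters._

object ScanDf2Hbase {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    val conn = ConnectionFactory.createConnection(conf)
    val table = conn.getTable(TableName.valueOf("df2Hbase"))
    val scanner = table.getScanner(new Scan())
    scanner.asScala.foreach { result =>
      // row key was written as Bytes.toBytes(id: Int), value as Bytes.toBytes(sal: Double)
      val id = Bytes.toInt(result.getRow)
      val sal = Bytes.toDouble(result.getValue(Bytes.toBytes("info"), Bytes.toBytes("sal")))
      println(s"id=$id, sal=$sal")
    }
    scanner.close()
    table.close()
    conn.close()
  }
}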