// Compute per (date, hour, site) page views (pv) and unique visitors (uv)
// from raw access lines and write the results to an HBase table.
val conf = new SparkConf()
conf.setMaster("local")
conf.setAppName("partition")
val sc = new SparkContext(conf)

// HBase configuration.
val hbaseConf = HBaseConfiguration.create()
// ZooKeeper ensemble settings; these could also come from hbase-site.xml on
// the classpath, but setting them explicitly in the program is more robust.
hbaseConf.set("zookeeper.znode.parent", "/hbase-cluster")
hbaseConf.set("hbase.zookeeper.quorum", "node0,node1,node2")
// ZooKeeper client port (2181 is the default).
hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
val tablename = "candle:test"

// Initialise the JobConf. Note: TableOutputFormat must be the one from the
// old-API org.apache.hadoop.hbase.mapred package for saveAsHadoopDataset.
val jobConf = new JobConf(hbaseConf)
jobConf.setOutputFormat(classOf[TableOutputFormat])
jobConf.set(TableOutputFormat.OUTPUT_TABLE, tablename)

val sourceRDD = sc.parallelize(List(
"site1, user1, 2016-11-20 02:18:33",
"site1, user2, 2016-11-20 02:18:33",
"site1, user3, 2016-11-20 02:18:33",
"site1, user2, 2016-11-20 02:18:33",
"site1, user3, 2016-11-20 02:18:33",
"site1, user1, 2016-11-20 02:18:33"
))

// Expected result for the sample data:
// ((2016-11-20,02,site1), (6,3))
sourceRDD
.map { line =>
// Split each record once instead of re-splitting per field.
val Array(site, user, time) = line.split(",").map(_.trim)
val Array(date, clock) = time.split(" ")
val hour = clock.split(":")(0)
(date + "," + hour + "," + site, user)
}
.groupByKey()
// (2016-11-20,02,site1, [user1, ...])
.map { case (key, users) =>
val pv = users.size // page views: total events for this key
val uv = users.toSet.size // unique visitors: distinct users
// Row key is the composite "date,hour,site" string.
val put = new Put(Bytes.toBytes(key))
put.addColumn(Bytes.toBytes("f1"), Bytes.toBytes("pv"), Bytes.toBytes(pv))
put.addColumn(Bytes.toBytes("f1"), Bytes.toBytes("uv"), Bytes.toBytes(uv))
(new ImmutableBytesWritable, put)
}
.saveAsHadoopDataset(jobConf)

// Release Spark resources once the job has completed.
sc.stop()
}