package com.analysis
import java.sql.DriverManager
import java.text.SimpleDateFormat
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, HConstants}
import org.apache.hadoop.mapred.JobConf
import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.JdbcRDD
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by songmingqi on 2018/8/27
*/
object Infectious {

  /**
   * ICD-10 code prefixes (the part before the '.') classified as infectious diseases.
   * NOTE(review): the original list contained "B06" twice; deduplicated here —
   * membership checks via `contains` are unaffected.
   */
  val list = Array(
    "A20", "A00", "J18", "B24", "B15", "B16", "B17", "B18",
    "B19", "A80", "A33", "A36", "A37", "A01", "A16", "A22", "A90", "A82", "A09",
    "B05", "A75", "A30", "B06", "B26", "J11", "B50", "B51", "B52", "B53",
    "B54", "B65", "A27", "A51", "A52", "A53", "B74", "A38", "P35", "B08", "B30")

  /**
   * Reads (id_no, icd_code, start_time) rows from Oracle, keeps only rows whose
   * ICD-10 prefix is an infectious-disease code, and writes, for each distinct
   * (id_no, icd_code) pair, the latest record into HBase table "test:test".
   */
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    // Oracle connection settings (placeholders — fill in before running).
    val url = "oracle数据库url"
    val username = "数据库用户"
    val password = "密码"
    // NOTE(review): rownum-based paging only works when the lower bound is 0;
    // with numPartitions > 1, partitions whose lower bound n > 1 would match no
    // rows (Oracle rownum restarts per query). Safe here only because a single
    // partition is used below — confirm before raising the partition count.
    val query = "select a.id_no,b.icd_code,b.start_time from sehr_xman a inner join sehr_xman_event_20170101 b on a.id = b.xman_id where a.id_no is not null and b.icd_code is not null and rownum >= ? and rownum < ? "

    val conf = new SparkConf()
      .setAppName("SehrXmanEhr")
      .setMaster("local[5]")
      //.setMaster("yarn")
    val sc = new SparkContext(conf)

    // Connection factory for JdbcRDD. Class.forName alone registers the JDBC
    // driver; the deprecated newInstance() call is unnecessary.
    val connection = () => {
      Class.forName("oracle.jdbc.driver.OracleDriver")
      DriverManager.getConnection(url, username, password)
    }

    // One partition, rownum range [0, 8000): at most 8000 rows of
    // (id_no, start_time, icd_code).
    val jdbcRdd = new JdbcRDD(
      sc,
      connection,
      query,
      0, 8000, 1,
      r => (r.getString("id_no"), r.getString("start_time"), r.getString("icd_code")))

    // Keep only rows whose ICD-10 prefix (before the '.') is in the
    // infectious-disease list, keyed by (id_no, icd_code) with start_time as value.
    val rdd = jdbcRdd.flatMap { case (idNo, startTime, icdCode) =>
      val prefix = icdCode.split("\\.")(0)
      if (list.contains(prefix)) Some(((idNo, icdCode), startTime)) else None
    }

    // HBase output configuration (old mapred API, as required by saveAsHadoopDataset).
    val hbaseConf = HBaseConfiguration.create()
    val job = new JobConf(hbaseConf)
    job.set(HConstants.ZOOKEEPER_QUORUM, "zookeeper端口号:2181,zookeeper端口号:2181,zookeeper端口号:2181")
    job.set(TableOutputFormat.OUTPUT_TABLE, "test:test")
    job.setOutputFormat(classOf[TableOutputFormat])

    // For each (id_no, icd_code) group keep the tuple with the greatest
    // start_time (tuple ordering: the key part is constant within a group, so
    // max effectively compares start_time strings) and emit one HBase Put.
    // Row key: <id_no>_Infectious_<icd_code>; column family "t".
    rdd.groupBy(_._1)
      .map { group =>
        val latest = group._2.max
        val put = new Put(Bytes.toBytes(s"${latest._1._1}_Infectious_${latest._1._2}"))
        put.addColumn(Bytes.toBytes("t"), Bytes.toBytes("icd10_catalog"), Bytes.toBytes(latest._1._2.split("\\.")(0)))
        put.addColumn(Bytes.toBytes("t"), Bytes.toBytes("startTime"), Bytes.toBytes(latest._2))
        put.addColumn(Bytes.toBytes("t"), Bytes.toBytes("icd10"), Bytes.toBytes(latest._1._2))
        // Wrap in a (key, value) pair so the Hadoop OutputFormat API can consume it.
        (new ImmutableBytesWritable(), put)
      }
      .saveAsHadoopDataset(job)

    sc.stop()
  }

  /** HBase Spark-SQL connector catalog describing the "test:test" table layout. */
  def catalog: String =
    s"""{
       |"table":{"namespace":"test", "name":"test"},
       |"rowkey":"_1",
       |"columns":{
       |"_1":{"cf":"rowkey","col":"_1","type":"string"},
       |"_2":{"cf":"p","col":"Infectious","type":"string"}
       |}
       |}""".stripMargin
}