使用 Spark 从 MongoDB 抓取数据,主要针对无埋点数据。

// Base builder: local master, Hive support enabled so Spark SQL can reach the metastore.
val sessionBuilder = SparkSession
  .builder()
  .master("local[*]")
  .appName("Spark")
  .enableHiveSupport()

// Warehouse location, Hive metastore, HDFS-HA client settings and the
// MongoDB input collection, applied as one batch of key/value configs.
val sessionSettings = Seq(
  "spark.sql.warehouse.dir" -> "file:///root/project/dcproject/spark-warehouse",
  "hive.metastore.uris" -> "thrift://sy-003.hadoop:9083",
  "fs.defaultFS" -> "hdfs://cluster",
  "dfs.nameservices" -> "cluster",
  "dfs.ha.namenodes.cluster" -> "nn1,nn2",
  "dfs.namenode.rpc-address.cluster.nn1" -> "sy-002.hadoop:8020",
  "dfs.namenode.rpc-address.cluster.nn2" -> "sy-003.hadoop:8020",
  "dfs.client.failover.proxy.provider.cluster" -> "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider",
  "spark.mongodb.input.uri" -> "mongodb://192.168.56.17:27017/growingio.webpv"
  // "spark.mongodb.output.uri" -> "mongodb://192.168.56.17:27017/growingio.webpv"
)

val spark = sessionSettings
  .foldLeft(sessionBuilder) { case (builder, (key, value)) => builder.config(key, value) }
  .getOrCreate()
以上是 SparkSession 的创建:除了启用 Hive 支持外,还可以直接为 MongoDB 添加配置。


package OutAnalysis

/*
* 无埋点数据 用户行为统计分析
* author: 谭志坚
* 2017-02-24
*

 */

import java.io.IOException

import DAO.{ScalaConn, ScalaHbase}
import com.mongodb.spark.MongoSpark
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.client.{Connection, Put, Table}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{MasterNotRunningException, TableName, ZooKeeperConnectionException}
import org.apache.spark.sql.{SparkSession, DataFrame}
import org.slf4j.LoggerFactory
import com.mongodb.spark.config._
import com.mongodb.spark._
import scala.collection.JavaConverters._
import org.bson.Document

/**
 * One raw web-action event loaded from MongoDB; every field is kept as a
 * plain string. Field names mirror the MongoDB document keys
 * (short GrowingIO-style keys — exact semantics live in the upstream schema).
 */
case class webaction(
    _id: String,
    p: String,
    b: String,
    s: String,
    t: String,
    d: String,
    u: String,
    v: String,
    x: String,
    h: String,
    ptm: String,
    tm: String)
@SerialVersionUID(-1)
object DW_WMD_WEBACTION {

  // Logger for this job.
  def LOG = LoggerFactory.getLogger(getClass)

  /**
   * Loads no-tracking-point ("无埋点") web-action events from MongoDB and
   * writes them column-by-column into the HBase table DW_WMD_WEBACTION
   * (column family USERS).
   *
   * Only rows whose `tm` is >= the maximum TM already stored in HBase are
   * processed, so the job is incrementally re-runnable.
   *
   * @param conf  Hadoop/HBase configuration used for table creation
   * @param hconn open HBase connection; the caller owns and closes it
   * @throws Exception propagated from table creation / Spark setup; errors
   *                   during the write loop are caught and logged instead
   */
  @throws(classOf[Exception])
  def DW_WMD_WEBACTION_run(conf: Configuration, hconn: Connection) {

    val familyColumn: Array[String] = Array[String]("USERS")
    ScalaHbase.createTable(conf, hconn, "DW_WMD_WEBACTION", familyColumn)
    val DW_WMD_WEBACTION: Table = hconn.getTable(TableName.valueOf("DW_WMD_WEBACTION"))

    // High-water mark: max TM already in HBase. A value containing '-' is the
    // helper's "no data yet" (date-formatted) sentinel, so start from 0.
    var maxdate: String = ScalaConn.getMaxDate(conf, hconn, DW_WMD_WEBACTION, "USERS", "TM")
    if (maxdate.contains("-")) {
      maxdate = "0"
    }

    val spark = ScalaConn.spark

    // Read the "webaction" collection, preferring secondary replica members.
    val readConfig = ReadConfig(
      Map("collection" -> "webaction", "readPreference.name" -> "secondaryPreferred"),
      Some(ReadConfig(spark.sparkContext)))
    val webaction = MongoSpark.load(spark.sparkContext, readConfig).toDF()
    webaction.createOrReplaceTempView("webaction")

    // Incremental filter on the event timestamp.
    val DW_WMD_WEBACTION_sql =
      "SELECT _id,p,s,t,d,u,v,x,h,ptm,tm FROM webaction WHERE tm >= " + maxdate
    val filteredDF: DataFrame = spark.sql(DW_WMD_WEBACTION_sql)

    try {
      // NOTE(review): collect() pulls every matching row to the driver —
      // fine for small increments, revisit before running large backfills.
      filteredDF.collect().foreach { row =>
        // _id renders with surrounding brackets (e.g. "[...]"); strip them.
        val id: String = String.valueOf(row.get(0)).replace("[", "").replace("]", "")

        val p: String = String.valueOf(row.getString(1))
        val s: String = String.valueOf(row.getString(2))
        val t: String = String.valueOf(row.getString(3))
        val d: String = String.valueOf(row.getString(4))
        val u: String = String.valueOf(row.getString(5))
        val v: String = String.valueOf(row.getString(6))
        val x: String = String.valueOf(row.getString(7))
        val h: String = String.valueOf(row.getString(8))
        val ptm: String = String.valueOf(row.getLong(9))
        val tm: String = String.valueOf(row.getLong(10))

        // Row key = event time + id: keeps rows time-ordered and unique.
        val rowKey: String = tm + id

        ScalaHbase.addRow(DW_WMD_WEBACTION, rowKey, "USERS", "ID", id)
        ScalaHbase.addRow(DW_WMD_WEBACTION, rowKey, "USERS", "P", p)
        ScalaHbase.addRow(DW_WMD_WEBACTION, rowKey, "USERS", "S", s)
        ScalaHbase.addRow(DW_WMD_WEBACTION, rowKey, "USERS", "T", t)
        ScalaHbase.addRow(DW_WMD_WEBACTION, rowKey, "USERS", "D", d)
        ScalaHbase.addRow(DW_WMD_WEBACTION, rowKey, "USERS", "U", u)
        ScalaHbase.addRow(DW_WMD_WEBACTION, rowKey, "USERS", "V", v)
        ScalaHbase.addRow(DW_WMD_WEBACTION, rowKey, "USERS", "X", x)
        ScalaHbase.addRow(DW_WMD_WEBACTION, rowKey, "USERS", "H", h)
        ScalaHbase.addRow(DW_WMD_WEBACTION, rowKey, "USERS", "PTM", ptm)
        ScalaHbase.addRow(DW_WMD_WEBACTION, rowKey, "USERS", "TM", tm)
      }
    } catch {
      // Pattern-match concrete types instead of comparing getClass (which
      // missed subclasses); specific HBase exceptions first, then IOException,
      // then anything else. Passing `e` as the second argument sends the stack
      // trace to the logger instead of stderr (the old code concatenated the
      // Unit result of printStackTrace, logging a literal "()").
      case e: MasterNotRunningException =>
        LOG.error("MasterNotRunningException: " + e.toString, e)
      case e: ZooKeeperConnectionException =>
        LOG.error("ZooKeeperConnectionException: " + e.toString, e)
      case e: IOException =>
        LOG.error("IOException: " + e.toString, e)
      case e: Exception =>
        LOG.error("unexpected exception: " + e.toString, e)
    } finally {
      if (null != DW_WMD_WEBACTION) {
        DW_WMD_WEBACTION.close()
      }
    }
  }
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值