Kafka -> Spark Streaming -> MongoDB

This project is a real-time pipeline: data is pulled from Kafka, cleaned, and saved to MongoDB (note: MongoDB's save method replaces the entire document matching the given _id), and the stored geo field is then used for geospatial queries.
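
A minimal sketch of that save behavior, using the same Casbah / Java-driver APIs as the job below (the connection details here are placeholders, not taken from the project):

import com.mongodb.MongoClient
import com.mongodb.casbah.commons.MongoDBObject

object SaveSemanticsDemo {
  def main(args: Array[String]): Unit = {
    // Placeholder host/db/collection names -- adjust to your environment
    val mongo = new MongoClient("localhost")
    val coll = mongo.getDB("spark").getCollection("oc")

    // The first save inserts the document
    coll.save(MongoDBObject("_id" -> "v001", "name" -> "car-1", "owner" -> "alice"))

    // A second save with the same _id replaces the WHOLE document:
    // the "owner" field from the first save is gone afterwards
    coll.save(MongoDBObject("_id" -> "v001", "name" -> "car-1-renamed"))

    println(coll.findOne(MongoDBObject("_id" -> "v001")))
    mongo.close()
  }
}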

Environment (managed with Maven):

<dependency>
    <groupId>org.scala-lang</groupId>
    <artifactId>scala-library</artifactId>
    <version>2.10.6</version>
</dependency>

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.10</artifactId>
    <version>1.6.1</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.6.4</version>
</dependency>

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.10</artifactId>
    <version>1.6.1</version>
</dependency>

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka_2.10</artifactId>
    <version>1.6.1</version>
</dependency>

<dependency>
    <groupId>redis.clients</groupId>
    <artifactId>jedis</artifactId>
    <version>2.8.1</version>
</dependency>

<dependency>
    <groupId>org.mongodb.mongo-hadoop</groupId>
    <artifactId>mongo-hadoop-core</artifactId>
    <version>2.0.1</version>
</dependency>
<dependency>
    <groupId>org.mongodb</groupId>
    <artifactId>mongo-java-driver</artifactId>
    <version>2.13.0</version>
</dependency>
<dependency>
    <groupId>org.mongodb</groupId>
    <artifactId>casbah-core_2.10</artifactId>
    <version>3.1.1</version>
</dependency>

The code is as follows:

import com.mongodb.MongoClientOptions.Builder
import com.mongodb.casbah.commons.{Imports, MongoDBObject}
import com.mongodb._
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.bson.{BSONObject, BasicBSONObject}

import scala.language.postfixOps

/**
 * Created by Administrator on 2016/9/6.
 */
object KafkaAndMongodb {

  val updateFunc=(iter:Iterator[(String,Seq[Int],Option[Int])]) =>{
    iter.flatMap{case (x,y,z) => Some(y.sum+z.getOrElse(0)).map(i => (x,i))}
  }
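  // Note: updateFunc above is never used in this job. It has the iterator-based shape
  // expected by DStream.updateStateByKey; a hypothetical use (not part of the original
  // code) would look like:
  //   wordCounts.updateStateByKey(updateFunc,
  //     new HashPartitioner(ssc.sparkContext.defaultParallelism), true)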


  def main(args: Array[String]) {
    LoggerLevels.setStreamingLogLevels()
    // Input arguments: zkQurnum, group, topic, numThreads, cachepath, host, dbname, collection
    // e.g. shizhanmini:2181,mini02:2181,mini03:2181 rto db100 2 c://ck2 192.168.199.7 spark oc
    val Array(zkQurnum, group, topic, numThreads, cachepath, host, dbname, collection) = args
    // Create the SparkConf; local mode is used here for testing
    val sparkConf = new SparkConf().setAppName("KafkaAndMongodb")
      .setMaster("local[2]")
    // Create the StreamingContext with a 3-second batch interval
    val ssc = new StreamingContext(sparkConf, Seconds(3))
    ssc.checkpoint(cachepath)
    // Turn the topic list into a Map of topic -> number of receiver threads
    val topicMap = topic.split(",").map((_, numThreads.toInt)).toMap

    // Create the receiver-based Kafka DStream and keep only the message value
    val lines = KafkaUtils.createStream(ssc, zkQurnum, group, topicMap, StorageLevel.MEMORY_AND_DISK).map(_._2)
    // Extract (id, name, lat, lon) from each comma-separated line, splitting once per line
    val users = lines.map(_.split(",")).map(f => (f(1), f(3), f(6), f(7)))
    // Initialize Mongo (done per partition inside foreachPartition below)
    //val db = getMongo(host, dbname)
   
    users.print()
    
    users.foreachRDD(rdd => {
      rdd.foreachPartition(par => {
        // Open one MongoDB connection per partition rather than per record
        //val db = MongodbUtils.getMongo()
        val db = getMongo(host, dbname)
        if (!par.isEmpty) {
          par.foreach(pair => {
            val id= pair._1
            val vname = pair._2
            val lat=pair._3.toFloat
            val lon=pair._4.toFloat
           // println(lat+"-----------geo-----------"+lon)
           
            // save() inserts the document, or replaces the whole existing document that has this _id
            val dBObject: Imports.DBObject = MongoDBObject("_id" -> id, "name" -> vname, "geo" -> Array(lat, lon))
            db.getCollection(collection).save(dBObject)
          })
        
        }
      })
     
    })
    
    ssc.start()
    ssc.awaitTermination()



  }

  def getMongo(host:String,dbname:String): DB= {
    // Initialize the connection-pool options
    val builder: Builder = MongoClientOptions.builder()
    builder.connectionsPerHost(500) // maximum number of connections to the target database
    builder.connectTimeout(1000 * 60 * 20) // timeout (ms) for establishing a connection
    builder.maxWaitTime(100 * 60 * 5) // maximum time (ms) a thread waits for an available connection
    builder.threadsAllowedToBlockForConnectionMultiplier(100)
    builder.maxConnectionIdleTime(0)
    builder.maxConnectionLifeTime(0)
    builder.socketTimeout(0)
    builder.socketKeepAlive(true)
    val options: MongoClientOptions = builder.build()
    var db: DB = null
    var mongo: MongoClient = null
    // Use the default host when the argument is empty
    if(host.isEmpty){
      mongo= new MongoClient("192.168.30.170",options)
    }else{
      mongo= new MongoClient(host,options)
    }
    // Use the default database name when the argument is empty
    if(dbname.isEmpty){
      db = mongo.getDB("spark")
    }else{
      db = mongo.getDB(dbname)
    }
    
    db

  }
}
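
The intro mentions geospatial queries on the geo field, but the streaming job above only writes the documents. A rough sketch of such a query with the same driver, assuming a legacy 2d index on geo (the job stores the pair as (lat, lon), so the query point keeps that order; MongoDB's own convention is [longitude, latitude], so swap the order if your data follows it):

import com.mongodb.{BasicDBList, BasicDBObject, MongoClient}
import com.mongodb.casbah.commons.MongoDBObject

object GeoQueryDemo {
  def main(args: Array[String]): Unit = {
    // Placeholder connection details -- match them to the streaming job's arguments
    val mongo = new MongoClient("192.168.199.7")
    val coll = mongo.getDB("spark").getCollection("oc")

    // A legacy 2d index on geo is required before $near can be used
    coll.createIndex(MongoDBObject("geo" -> "2d"))

    // Query point in the same (lat, lon) order the job writes
    val point = new BasicDBList()
    point.add(Double.box(30.5))
    point.add(Double.box(114.3))

    // Return the 10 documents closest to the point
    val cursor = coll.find(new BasicDBObject("geo", new BasicDBObject("$near", point))).limit(10)
    while (cursor.hasNext) {
      println(cursor.next())
    }
    mongo.close()
  }
}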
