Writing to a MySQL database
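The program below matches each access-log IP against a broadcast array of numeric IP ranges to count hits per province, then writes the (province, count) pairs to MySQL from each partition via foreachPartition.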
import java.sql.{Connection, Date, DriverManager, PreparedStatement}
import org.apache.spark.{SparkConf, SparkContext}
object IPLocation {
  // Write one partition's (location, count) pairs into MySQL.
  val data2MySQL = (iterator: Iterator[(String, Int)]) => {
    var conn: Connection = null
    var ps: PreparedStatement = null
    val sql = "INSERT INTO location_info (location, counts, access_date) VALUES (?, ?, ?)"
    try {
      conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/home/hadoop", "hadoop", "000000fzx")
      // Prepare the statement once and reuse it for every row in the partition.
      ps = conn.prepareStatement(sql)
      iterator.foreach(line => {
        ps.setString(1, line._1)
        ps.setInt(2, line._2)
        ps.setDate(3, new Date(System.currentTimeMillis()))
        ps.executeUpdate()
      })
    } catch {
      case e: Exception => println("MySQL Exception: " + e)
    } finally {
      if (ps != null)
        ps.close()
      if (conn != null)
        conn.close()
    }
  }
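  // Note: data2MySQL is applied with foreachPartition below, so one JDBC
  // connection is opened per partition rather than one per output record.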
  // Convert a dotted-decimal IPv4 string to its decimal Long value,
  // e.g. "1.0.1.0" -> 1*2^24 + 0*2^16 + 1*2^8 + 0 = 16777472
  def ip2Long(ip: String): Long = {
    val fragments = ip.split("[.]")
    var ipNum = 0L
    for (i <- 0 until fragments.length) {
      // Shift the accumulated value left 8 bits and OR in the next octet.
      ipNum = fragments(i).toLong | ipNum << 8L
    }
    ipNum
  }
  // Binary search over rules sorted by start IP; each rule is
  // (startNum, endNum, province). Returns the index of the matching range,
  // or -1 if the IP falls in no range.
  def binarySearch(lines: Array[(String, String, String)], ip: Long): Int = {
    var low = 0
    var high = lines.length - 1
    while (low <= high) {
      val middle = (low + high) / 2
      if ((ip >= lines(middle)._1.toLong) && (ip <= lines(middle)._2.toLong))
        return middle
      if (ip < lines(middle)._1.toLong)
        high = middle - 1
      else
        low = middle + 1
    }
    -1
  }
  def main(args: Array[String]): Unit = {
    // "local[2]" keeps the demo runnable on one machine, matching JdbcRddDemo below.
    val conf = new SparkConf().setAppName("IPLocation").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // Load the IP rules file (path left blank as a placeholder). Lines are
    // pipe-delimited; fields 2 and 3 hold the numeric start/end of each range
    // and field 6 holds the province.
    val ipRulesRdd = sc.textFile(" ").map(line => {
      val fields = line.split("\\|")
      val start_num = fields(2)
      val end_num = fields(3)
      val province = fields(6)
      (start_num, end_num, province)
    })
    // Collect the rules to the driver and broadcast them so every executor
    // holds a single read-only copy instead of shipping them with each task.
    val ipRulesArray = ipRulesRdd.collect()
    val ipRulesBroadcast = sc.broadcast(ipRulesArray)
    // Load the access log (path left blank as a placeholder); the IP sits at field 1.
    val ipsRDD = sc.textFile(" ").map(line => {
      val fields = line.split("\\|")
      fields(1)
    })
    // Map every IP to the index of its rule; IPs matching no range yield -1
    // and are filtered out before indexing into the broadcast array.
    val result = ipsRDD.map(ip => {
      val ipNum = ip2Long(ip)
      binarySearch(ipRulesBroadcast.value, ipNum)
    }).filter(_ != -1)
      .map(index => ipRulesBroadcast.value(index))
      .map(t => (t._3, 1))
      .reduceByKey(_ + _)
    // Write each partition's results to MySQL, then print them on the driver.
    result.foreachPartition(data2MySQL)
    println(result.collect().toBuffer)
    sc.stop()
  }
}
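For reference, here is a minimal sketch of creating the target table. The schema is an assumption inferred from the INSERT in data2MySQL (a text location, an integer count, and a date); adapt names, types, and credentials to your setup.

import java.sql.DriverManager

object CreateLocationTable {
  def main(args: Array[String]): Unit = {
    // Assumed schema matching the INSERT above (location, counts, access_date).
    val conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/home/hadoop", "hadoop", "000000fzx")
    val stmt = conn.createStatement()
    stmt.executeUpdate(
      """CREATE TABLE IF NOT EXISTS location_info (
        |  location    VARCHAR(100),
        |  counts      INT,
        |  access_date DATE
        |)""".stripMargin)
    stmt.close()
    conn.close()
  }
}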
Reading from a MySQL database
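JdbcRDD parallelizes a ranged query: the two ? placeholders in the SQL are filled with sub-ranges of the given id bounds (here 1 to 4, split into 2 partitions), and the row-mapping function turns each ResultSet row into an (id, code) tuple.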
import java.sql.DriverManager
import org.apache.spark.rdd.JdbcRDD
import org.apache.spark.{SparkConf, SparkContext}
object JdbcRddDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("JdbcRddDemo").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // Connection factory; JdbcRDD invokes it on the executors.
    // Class.forName registers the driver; the deprecated newInstance() call is unnecessary.
    val connection = () => {
      Class.forName("com.mysql.jdbc.Driver")
      DriverManager.getConnection("jdbc:mysql://localhost:3306/home/hadoop", "hadoop", "000000fzx")
    }
    val jdbcRdd = new JdbcRDD(
      sc,
      connection,
      "SELECT * FROM table_1 WHERE id >= ? AND id <= ?",
      1, // lowerBound for id
      4, // upperBound for id
      2, // numPartitions: the [1, 4] range is split into two sub-queries
      rs => {
        val id = rs.getInt(1)
        val code = rs.getString(2)
        (id, code)
      }
    )
    // Collect once and print the result on the driver.
    val jrdd = jdbcRdd.collect()
    println(jrdd.toBuffer)
    sc.stop()
  }
}
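To make the demo runnable end to end, a hypothetical seeding snippet follows; table_1's schema is assumed from the row mapping above (an integer id and a string code), and the sample rows are invented for illustration.

import java.sql.DriverManager

object SeedTable1 {
  def main(args: Array[String]): Unit = {
    // Hypothetical table and rows inferred from rs.getInt(1) / rs.getString(2).
    val conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/home/hadoop", "hadoop", "000000fzx")
    val stmt = conn.createStatement()
    stmt.executeUpdate("CREATE TABLE IF NOT EXISTS table_1 (id INT PRIMARY KEY, code VARCHAR(32))")
    stmt.close()
    val ps = conn.prepareStatement("INSERT INTO table_1 (id, code) VALUES (?, ?)")
    for (i <- 1 to 4) {
      ps.setInt(1, i)
      ps.setString(2, "code" + i)
      ps.executeUpdate()
    }
    ps.close()
    conn.close()
  }
}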