一.代码
package com.xiaopeng.test
import java.sql.Connection
import com.xiaopeng.bi.utils.{JdbcUtil, SparkUtils} import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.{DataFrame, Row, SQLContext} import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.ArrayBuffer
/**
  * Created by kequan on 3/27/17.
  *
  * Demonstrates collecting DataFrame rows into a driver-side ArrayBuffer via a
  * broadcast variable, once with foreachPartition and once with foreach.
  *
  * NOTE(review): broadcast variables are read-only by contract. Mutating
  * `broadcast.value` on executors is only visible to the driver in local mode,
  * where driver and executors share one JVM — do not rely on this in a cluster.
  */
object Test {
  def main(args: Array[String]): Unit = {
    // Build the Spark / Hive contexts.
    val sparkConf = new SparkConf().setAppName(this.getClass.getName.replace("$", ""))
    SparkUtils.setMaster(sparkConf)
    val sc = new SparkContext(sparkConf)
    val hiveContext = new HiveContext(sc)
    val sqlContext = new SQLContext(sc)
    hiveContext.sql("use yyft")

    val df: DataFrame = hiveContext.sql("select * from game_sdk")

    // Case 1: foreachPartition — append every Row of the partition to the
    // broadcast buffer. The JDBC connection is closed in `finally`; the
    // original code leaked it (and created an unused Statement).
    val rows = new ArrayBuffer[Row]()
    val rowsBroadCast: Broadcast[ArrayBuffer[Row]] = sc.broadcast(rows)
    df.rdd.foreachPartition(iter => {
      val conn: Connection = JdbcUtil.getConn()
      try {
        iter.foreach(row => rowsBroadCast.value += row)
      } finally {
        conn.close()
      }
    })

    // Case 2: foreach — same collection pattern, one row at a time.
    // BUG FIX: the original broadcast `rows` here instead of `rows2`, which is
    // why `rows2` stayed empty — not because foreach "cannot read data".
    val rows2 = new ArrayBuffer[Row]()
    val rowsBroadCast2: Broadcast[ArrayBuffer[Row]] = sc.broadcast(rows2)
    df.rdd.foreach(row => rowsBroadCast2.value += row)

    for (param <- rows) {
      println(param.toString()) // has data (local mode: shared JVM)
    }
    println("---------------------------------")
    for (param <- rows2) {
      println(param.toString())
    }
  }
}
二.总结
1.foreachPartition 在每个 Partition 上执行,共用的资源要广播,广播的资源要可序列化(ArrayBuffer 默认可序列化)。但要注意:广播变量本质上是只读的,在 executor 端修改 broadcast.value 只有在 local 模式(driver 与 executor 同一 JVM)下才会反映到 driver 端,集群模式下不可依赖这种写法
2.rows2 没有数据并不是因为 foreach 读不到数据,而是原代码把 rowsBroadCast2 误写成了 sc.broadcast(rows)(应为 sc.broadcast(rows2)),导致 foreach 一直往另一个 buffer 里追加;修正广播对象后 foreach 与 foreachPartition 行为一致