Memory
execution: computation
storage: caching and propagating
1.6 StaticMemoryManager
1.6+ UnifiedMemoryManager
SizeEstimator.estimate(file)
List<Integer> list = new ArrayList<Integer>()
int[] array = new int[10]
Map<Integer,Student> students = new HashMap<Integer,Student>()
id:name,age,grade.....$id:name,age,grade.....
1:大舅哥,25,1$2:二舅哥,24,2....
class Classes {
List<舅哥> s = new ArrayList<舅哥>();
}
json
数据本地化
RDD
preferredLocations
广播
map
function(map)
1 wenzi
2 xiaolu
1 school1 201
2 school2 202
3 school3 203
1 wenzi school1 201
2 xiaolu school2 202
a join b on a.id=b.id
// Reduce-side (shuffle) join demo: key both RDDs by id, then join them.
// peoples: (id, (id, name)); details: (id, (id, school, classNo))
val peoples = sc.parallelize(Array(("1", "wenzi"), ("2", "xiaolu")))
  .map(person => (person._1, person))
val details = sc.parallelize(Array(
  ("1", "school1", "201"),
  ("2", "school2", "202"),
  ("3", "school3", "203")
)).map(detail => (detail._1, detail))
// join produces (id, (person, detail)) for ids present in BOTH RDDs — id "3" is dropped.
peoples.join(details)
peoples.join(details).map {
  // Destructure instead of navigating nested tuples (_._2._1._2 etc.)
  case (id, (person, detail)) =>
    id + "," + person._2 + "," + detail._2 + "," + detail._3
}.collect.foreach(println)
// Map-side (broadcast) join demo: collect the small RDD to the driver once,
// then ship it to every executor as a broadcast variable — no shuffle needed.
val peoples = sc.parallelize(Array(("1","wenzi"),("2","xiaolu")))
.collectAsMap()
val details = sc.parallelize(Array(
("1","school1","201"),
("2","school2","202"),
("3","school3","203")
)).map(x => (x._1, x))
// Fix: broadcast the map already collected into `peoples` above; the original
// re-parallelized and re-collected the same data a second time, leaving
// `peoples` as dead code and doing the driver round-trip twice.
val peoplesBroadcast = sc.broadcast(peoples)
details.mapPartitions(partition => {
  // Read the broadcast value once per partition, not once per record.
  val broadcastPeoples = peoplesBroadcast.value
  // Keep only detail rows whose id exists in the broadcast map (inner join),
  // emitting (id, name, school, classNo).
  for ((key, value) <- partition if broadcastPeoples.contains(key))
    yield (key, broadcastPeoples.getOrElse(key, ""), value._2, value._3)
}).collect().foreach(println)
# Launch an interactive Spark shell on YARN with 2 executors,
# each getting 1 GB of memory and 1 core.
./spark-shell --master yarn \
--executor-memory=1G \
--num-executors=2 \
--executor-cores=1
execution: computation
storage: caching and propagating
1.6 StaticMemoryManager
1.6+ UnifiedMemoryManager
SizeEstimator.estimate(file)
List<Integer> list = new ArrayList<Integer>()
int[] array = new int[10]
Map<Integer,Student> students = new HashMap<Integer,Student>()
id:name,age,grade.....$id:name,age,grade.....
1:大舅哥,25,1$2:二舅哥,24,2....
class Classes {
List<舅哥> s = new ArrayList<舅哥>();
}
json
数据本地化
RDD
preferredLocations
广播
map
function(map)
1 wenzi
2 xiaolu
1 school1 201
2 school2 202
3 school3 203
1 wenzi school1 201
2 xiaolu school2 202
a join b on a.id=b.id
// Reduce-side (shuffle) join demo: key both RDDs by id, then join them.
// peoples: (id, (id, name)); details: (id, (id, school, classNo))
val peoples = sc.parallelize(Array(("1", "wenzi"), ("2", "xiaolu")))
  .map(person => (person._1, person))
val details = sc.parallelize(Array(
  ("1", "school1", "201"),
  ("2", "school2", "202"),
  ("3", "school3", "203")
)).map(detail => (detail._1, detail))
// join produces (id, (person, detail)) for ids present in BOTH RDDs — id "3" is dropped.
peoples.join(details)
peoples.join(details).map {
  // Destructure instead of navigating nested tuples (_._2._1._2 etc.)
  case (id, (person, detail)) =>
    id + "," + person._2 + "," + detail._2 + "," + detail._3
}.collect.foreach(println)
// Map-side (broadcast) join demo: collect the small RDD to the driver once,
// then ship it to every executor as a broadcast variable — no shuffle needed.
val peoples = sc.parallelize(Array(("1","wenzi"),("2","xiaolu")))
.collectAsMap()
val details = sc.parallelize(Array(
("1","school1","201"),
("2","school2","202"),
("3","school3","203")
)).map(x => (x._1, x))
// Fix: broadcast the map already collected into `peoples` above; the original
// re-parallelized and re-collected the same data a second time, leaving
// `peoples` as dead code and doing the driver round-trip twice.
val peoplesBroadcast = sc.broadcast(peoples)
details.mapPartitions(partition => {
  // Read the broadcast value once per partition, not once per record.
  val broadcastPeoples = peoplesBroadcast.value
  // Keep only detail rows whose id exists in the broadcast map (inner join),
  // emitting (id, name, school, classNo).
  for ((key, value) <- partition if broadcastPeoples.contains(key))
    yield (key, broadcastPeoples.getOrElse(key, ""), value._2, value._3)
}).collect().foreach(println)
# Launch an interactive Spark shell on YARN with 2 executors,
# each getting 1 GB of memory and 1 core.
./spark-shell --master yarn \
--executor-memory=1G \
--num-executors=2 \
--executor-cores=1