1.先启动hadoop,启动hdfs和yarn。在开发Spark时,仅需要启动hdfs:
cd /usr/local/hadoop-2.4.0
1 sbin/start-dfs.sh
2 sbin/start-yarn.sh
2.启动spark
1 cd /usr/local/spark-1.1.0-bin-hadoop2.4/sbin
2 ./start-all.sh
3.启动spark-shell
1 cd /usr/local/spark-1.1.0-bin-hadoop2.4/bin
2 ./spark-shell
退出shell
exit
cd /usr/local/hadoop-2.4.0
1 sbin/start-dfs.sh
2 sbin/start-yarn.sh
2.启动spark
1 cd /usr/local/spark-1.1.0-bin-hadoop2.4/sbin
2 ./start-all.sh
3.启动spark-shell
1 cd /usr/local/spark-1.1.0-bin-hadoop2.4/bin
2 ./spark-shell
退出shell
exit
RDD(Resilient Distributed Dataset,弹性分布式数据集)
scala> val rdd = sc.parallelize(List(1,2,3,4,5,6))
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[0] at parallelize at <console>:12
scala> val mapRdd = rdd.map(2 * _)
mapRdd: org.apache.spark.rdd.RDD[Int] = MappedRDD[1] at map at <console>:14
scala> mapRdd.collect
15/02/05 11:52:01 INFO spark.SparkContext: Starting job: collect at <console>:17
15/02/05 11:52:01 INFO scheduler.DAGScheduler: Got job 0 (collect at <console>:17) with 1 output partitions (allowLocal=false)
15/02/05 11:52:01 INFO scheduler.DAGScheduler: Final stage: Stage 0(collect at <console>:17)
15/02/05 11:52:01 INFO scheduler.DAGScheduler: Parents of final stage: List()
15/02/05 11:52:01 INFO scheduler.DAGScheduler: Missing parents: List()
15/02/05 11:52:01 INFO scheduler.DAGScheduler: Submitting Stage 0 (MappedRDD[1] at map at <console>:14), which has no missing parents
15/02/05 11:52:02 INFO storage.MemoryStore: ensureFreeSpace(1656) called with curMem=0, maxMem=280248975
15/02/05 11:52:02 INFO storage.MemoryStore: Block broadcast_0 stored as values in memory (estimated size 1656.0 B, free 267.3 MB)
15/02/05 11:52:03 INFO storage.MemoryStore: ensureFreeSpace(1211) called with curMem=1656, maxMem=280248975
15/02/05 11:52:03 INFO storage.MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 1211.0 B, free 267.3 MB)
15/02/05 11:52:03 INFO storage.BlockManagerInfo: Added broadcast_0_piece0 in memory on localhost:34086 (size: 1211.0 B, free: 267.3 MB)
15/02/05 11:52:03 INFO storage.BlockManagerMaster: Updated info of block broadcast_0_piece0
15/02/05 11:52:03 INFO spark.SparkContext: Created broadcast 0 from broadcast at DAGScheduler.scala:838
15/02/05 11:52:03 INFO scheduler.DAGScheduler: Submitting 1 missing tasks from Stage 0 (MappedRDD[1] at map at <console>:14)
15/02/05 11:52:03 INFO scheduler.TaskSchedulerImpl: Adding task set 0.0 with 1 tasks
15/02/05 11:52:03 INFO scheduler.TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, localhost, PROCESS_LOCAL, 1224 bytes)
15/02/05 11:52:03 INFO executor.Executor: Running task 0.0 in stage 0.0 (TID 0)
15/02/05 11:52:03 INFO executor.Executor: Finished task 0.0 in stage 0.0 (TID 0). 618 bytes result sent to driver
15/02/05 11:52:03 INFO scheduler.TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 405 ms on localhost (1/1)
15/02/05 11:52:03 INFO scheduler.DAGScheduler: Stage 0 (collect at <console>:17) finished in 0.506 s
15/02/05 11:52:04 INFO scheduler.DAGScheduler: Job 0 finished: collect at <console>:17, took 2.923469 s
15/02/05 11:52:04 INFO scheduler.TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
res0: Array[Int] = Array(2, 4, 6, 8, 10, 12)
scala>
scala> mapRdd
res1: org.apache.spark.rdd.RDD[Int] = MappedRDD[1] at map at <console>:14
//*********************************
scala> val filterRdd = mapRdd.filter(_ > 5)
filterRdd: org.apache.spark.rdd.RDD[Int] = FilteredRDD[2] at filter at <console>:16
scala> filterRdd.collect
15/02/05 11:57:17 INFO spark.SparkContext: Starting job: collect at <console>:19
15/02/05 11:57:17 INFO scheduler.DAGScheduler: Got job 1 (collect at <console>:19) with 1 output partitions (allowLocal=false)
15/02/05 11:57:17 INFO scheduler.DAGScheduler: Final stage: Stage 1(collect at <console>:19)
15/02/05 11:57:17 INFO scheduler.DAGScheduler: Parents of final stage: List()
15/02/05 11:57:17 INFO scheduler.DAGScheduler: Missing parents: List()
15/02/05 11:57:17 INFO scheduler.DAGScheduler: Submitting Stage 1 (FilteredRDD[2] at filter at <console>:16), which has no missing parents
15/02/05 11:57:17 INFO storage.MemoryStore: ensureFreeSpace(1856) called with curMem=2867, maxMem=280248975
15/02/05 11:57:17 INFO storage.MemoryStore: Block broadcast_1 stored as values in memory (estimated size 1856.0 B, free 267.3 MB)
15/02/05 11:57:17 INFO storage.MemoryStore: ensureFreeSpace(1342) called with curMem=4723, maxMem=280248975
15/02/05 11:57:17 INFO storage.MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 1342.0 B, free 267.3 MB)
15/02/05 11:57:17 INFO storage.BlockManagerInfo: Added broadcast_1_piece0 in memory on localhost:34086 (size: 1342.0 B, free: 267.3 MB)
15/02/05 11:57:17 INFO storage.BlockManagerMaster: Updated info of block broadcast_1_piece0
15/02/05 11:57:17 INFO spark.SparkContext: Created broadcast 1 from broadcast at DAGScheduler.scala:838
15/02/05 11:57:17 INFO scheduler.DAGScheduler: Submitting 1 missing tasks from Stage 1 (FilteredRDD[2] at filter at <console>:16)
15/02/05 11:57:17 INFO scheduler.TaskSchedulerImpl: Adding task set 1.0 with 1 tasks
15/02/05 11:57:17 INFO scheduler.TaskSetManager: Starting task 0.0 in s