package com.spark
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
object SimpleApp {
  /** Entry point: counts lines containing "a" and lines containing "b" in a
    * local text file using a local-mode Spark context (Spark 0.9 / Scala 2.10 API).
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = { // explicit `: Unit =` — procedure syntax is deprecated
    val logFile = "D:\\test.txt" // Should be some file on your system
    val sc = new SparkContext("local", "Simple App", "E:\\00_spark_scala\\spark\\spark-0.9.0-incubating-bin-hadoop1",
      List("lib/spark-assembly_2.10-0.9.0-incubating-hadoop1.0.4.jar"))
    try {
      // cache() because the RDD is traversed twice, once per count()
      val logData = sc.textFile(logFile, 2).cache()
      val numAs = logData.filter(line => line.contains("a")).count()
      val numBs = logData.filter(line => line.contains("b")).count()
      println("Lines with a: %s, Lines with b: %s".format(numAs, numBs))
    } finally {
      // Always release the context (executors, web UI, temp dirs) even if a job fails.
      sc.stop()
    }
  }
}
Output:
log4j:WARN No appenders could be found for logger (akka.event.slf4j.Slf4jLogger).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
14/04/04 14:58:10 INFO SparkEnv: Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
14/04/04 14:58:10 INFO SparkEnv: Registering BlockManagerMaster
14/04/04 14:58:10 INFO DiskBlockManager: Created local directory at C:\Users\WENBO_~1\AppData\Local\Temp\spark-local-20140404145810-9079
14/04/04 14:58:10 INFO MemoryStore: MemoryStore started with capacity 1068.8 MB.
14/04/04 14:58:10 INFO ConnectionManager: Bound socket to port 64567 with id = ConnectionManagerId(XA-NA18818395.allyes.group,64567)
14/04/04 14:58:10 INFO BlockManagerMaster: Trying to register BlockManager
14/04/04 14:58:10 INFO BlockManagerMasterActor$BlockManagerInfo: Registering block manager XA-NA18818395.allyes.group:64567 with 1068.8 MB RAM
14/04/04 14:58:10 INFO BlockManagerMaster: Registered BlockManager
14/04/04 14:58:10 INFO HttpServer: Starting HTTP Server
14/04/04 14:58:10 INFO HttpBroadcast: Broadcast server started at http://10.200.33.176:64568
14/04/04 14:58:10 INFO SparkEnv: Registering MapOutputTracker
14/04/04 14:58:10 INFO HttpFileServer: HTTP File server directory is C:\Users\WENBO_~1\AppData\Local\Temp\spark-fd43358e-905e-4af3-96f8-96145b92acde
14/04/04 14:58:10 INFO HttpServer: Starting HTTP Server
14/04/04 14:58:11 INFO SparkUI: Started Spark Web UI at http://XA-NA18818395.allyes.group:4040
14/04/04 14:58:11 INFO SparkContext: Added JAR lib/spark-assembly_2.10-0.9.0-incubating-hadoop1.0.4.jar at http://10.200.33.176:64569/jars/spark-assembly_2.10-0.9.0-incubating-hadoop1.0.4.jar with timestamp 1396594691413
14/04/04 14:58:11 INFO MemoryStore: ensureFreeSpace(32960) called with curMem=0, maxMem=1120744243
14/04/04 14:58:11 INFO MemoryStore: Block broadcast_0 stored as values to memory (estimated size 32.2 KB, free 1068.8 MB)
14/04/04 14:58:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
14/04/04 14:58:11 WARN LoadSnappy: Snappy native library not loaded
14/04/04 14:58:11 INFO FileInputFormat: Total input paths to process : 1
14/04/04 14:58:11 INFO SparkContext: Starting job: count at test.scala:12
14/04/04 14:58:11 INFO DAGScheduler: Got job 0 (count at test.scala:12) with 2 output partitions (allowLocal=false)
14/04/04 14:58:11 INFO DAGScheduler: Final stage: Stage 0 (count at test.scala:12)
14/04/04 14:58:11 INFO DAGScheduler: Parents of final stage: List()
14/04/04 14:58:11 INFO DAGScheduler: Missing parents: List()
14/04/04 14:58:11 INFO DAGScheduler: Submitting Stage 0 (FilteredRDD[2] at filter at test.scala:12), which has no missing parents
14/04/04 14:58:11 INFO DAGScheduler: Submitting 2 missing tasks from Stage 0 (FilteredRDD[2] at filter at test.scala:12)
14/04/04 14:58:11 INFO TaskSchedulerImpl: Adding task set 0.0 with 2 tasks
14/04/04 14:58:11 INFO TaskSetManager: Starting task 0.0:0 as TID 0 on executor localhost: localhost (PROCESS_LOCAL)
14/04/04 14:58:11 INFO TaskSetManager: Serialized task 0.0:0 as 1700 bytes in 4 ms
14/04/04 14:58:11 INFO Executor: Running task ID 0
14/04/04 14:58:11 INFO Executor: Fetching http://10.200.33.176:64569/jars/spark-assembly_2.10-0.9.0-incubating-hadoop1.0.4.jar with timestamp 1396594691413
14/04/04 14:58:11 INFO Utils: Fetching http://10.200.33.176:64569/jars/spark-assembly_2.10-0.9.0-incubating-hadoop1.0.4.jar to C:\Users\WENBO_~1\AppData\Local\Temp\fetchFileTemp7285086741261208295.tmp
14/04/04 14:58:23 INFO Executor: Adding file:/C:/Users/WENBO_~1/AppData/Local/Temp/spark-f2a903e2-1bee-43fe-983c-0a6f09969078/spark-assembly_2.10-0.9.0-incubating-hadoop1.0.4.jar to class loader
14/04/04 14:58:23 INFO BlockManager: Found block broadcast_0 locally
14/04/04 14:58:23 INFO CacheManager: Partition rdd_1_0 not found, computing it
14/04/04 14:58:23 INFO HadoopRDD: Input split: file:/D:/test.txt:0+2418
14/04/04 14:58:23 INFO MemoryStore: ensureFreeSpace(5720) called with curMem=32960, maxMem=1120744243
14/04/04 14:58:23 INFO MemoryStore: Block rdd_1_0 stored as values to memory (estimated size 5.6 KB, free 1068.8 MB)
14/04/04 14:58:23 INFO BlockManagerMasterActor$BlockManagerInfo: Added rdd_1_0 in memory on XA-NA18818395.allyes.group:64567 (size: 5.6 KB, free: 1068.8 MB)
14/04/04 14:58:23 INFO BlockManagerMaster: Updated info of block rdd_1_0
14/04/04 14:58:23 INFO Executor: Serialized size of result for 0 is 563
14/04/04 14:58:23 INFO Executor: Sending result for 0 directly to driver
14/04/04 14:58:23 INFO Executor: Finished task ID 0
14/04/04 14:58:23 INFO TaskSetManager: Starting task 0.0:1 as TID 1 on executor localhost: localhost (PROCESS_LOCAL)
14/04/04 14:58:23 INFO TaskSetManager: Serialized task 0.0:1 as 1700 bytes in 1 ms
14/04/04 14:58:23 INFO Executor: Running task ID 1
14/04/04 14:58:23 INFO TaskSetManager: Finished TID 0 in 11529 ms on localhost (progress: 0/2)
14/04/04 14:58:23 INFO BlockManager: Found block broadcast_0 locally
14/04/04 14:58:23 INFO DAGScheduler: Completed ResultTask(0, 0)
14/04/04 14:58:23 INFO CacheManager: Partition rdd_1_1 not found, computing it
14/04/04 14:58:23 INFO HadoopRDD: Input split: file:/D:/test.txt:2418+2419
14/04/04 14:58:23 INFO MemoryStore: ensureFreeSpace(5216) called with curMem=38680, maxMem=1120744243
14/04/04 14:58:23 INFO MemoryStore: Block rdd_1_1 stored as values to memory (estimated size 5.1 KB, free 1068.8 MB)
14/04/04 14:58:23 INFO BlockManagerMasterActor$BlockManagerInfo: Added rdd_1_1 in memory on XA-NA18818395.allyes.group:64567 (size: 5.1 KB, free: 1068.8 MB)
14/04/04 14:58:23 INFO BlockManagerMaster: Updated info of block rdd_1_1
14/04/04 14:58:23 INFO Executor: Serialized size of result for 1 is 563
14/04/04 14:58:23 INFO Executor: Sending result for 1 directly to driver
14/04/04 14:58:23 INFO Executor: Finished task ID 1
14/04/04 14:58:23 INFO TaskSetManager: Finished TID 1 in 11 ms on localhost (progress: 1/2)
14/04/04 14:58:23 INFO DAGScheduler: Completed ResultTask(0, 1)
14/04/04 14:58:23 INFO TaskSchedulerImpl: Remove TaskSet 0.0 from pool
14/04/04 14:58:23 INFO DAGScheduler: Stage 0 (count at test.scala:12) finished in 11.545 s
14/04/04 14:58:23 INFO SparkContext: Job finished: count at test.scala:12, took 11.632424408 s
14/04/04 14:58:23 INFO SparkContext: Starting job: count at test.scala:13
14/04/04 14:58:23 INFO DAGScheduler: Got job 1 (count at test.scala:13) with 2 output partitions (allowLocal=false)
14/04/04 14:58:23 INFO DAGScheduler: Final stage: Stage 1 (count at test.scala:13)
14/04/04 14:58:23 INFO DAGScheduler: Parents of final stage: List()
14/04/04 14:58:23 INFO DAGScheduler: Missing parents: List()
14/04/04 14:58:23 INFO DAGScheduler: Submitting Stage 1 (FilteredRDD[3] at filter at test.scala:13), which has no missing parents
14/04/04 14:58:23 INFO DAGScheduler: Submitting 2 missing tasks from Stage 1 (FilteredRDD[3] at filter at test.scala:13)
14/04/04 14:58:23 INFO TaskSchedulerImpl: Adding task set 1.0 with 2 tasks
14/04/04 14:58:23 INFO TaskSetManager: Starting task 1.0:0 as TID 2 on executor localhost: localhost (PROCESS_LOCAL)
14/04/04 14:58:23 INFO TaskSetManager: Serialized task 1.0:0 as 1703 bytes in 0 ms
14/04/04 14:58:23 INFO Executor: Running task ID 2
14/04/04 14:58:23 INFO BlockManager: Found block broadcast_0 locally
14/04/04 14:58:23 INFO BlockManager: Found block rdd_1_0 locally
14/04/04 14:58:23 INFO Executor: Serialized size of result for 2 is 563
14/04/04 14:58:23 INFO Executor: Sending result for 2 directly to driver
14/04/04 14:58:23 INFO Executor: Finished task ID 2
14/04/04 14:58:23 INFO TaskSetManager: Starting task 1.0:1 as TID 3 on executor localhost: localhost (PROCESS_LOCAL)
14/04/04 14:58:23 INFO TaskSetManager: Serialized task 1.0:1 as 1703 bytes in 0 ms
14/04/04 14:58:23 INFO Executor: Running task ID 3
14/04/04 14:58:23 INFO TaskSetManager: Finished TID 2 in 5 ms on localhost (progress: 0/2)
14/04/04 14:58:23 INFO DAGScheduler: Completed ResultTask(1, 0)
14/04/04 14:58:23 INFO BlockManager: Found block broadcast_0 locally
14/04/04 14:58:23 INFO BlockManager: Found block rdd_1_1 locally
14/04/04 14:58:23 INFO Executor: Serialized size of result for 3 is 563
14/04/04 14:58:23 INFO Executor: Sending result for 3 directly to driver
14/04/04 14:58:23 INFO Executor: Finished task ID 3
14/04/04 14:58:23 INFO TaskSetManager: Finished TID 3 in 5 ms on localhost (progress: 1/2)
14/04/04 14:58:23 INFO DAGScheduler: Completed ResultTask(1, 1)
14/04/04 14:58:23 INFO TaskSchedulerImpl: Remove TaskSet 1.0 from pool
14/04/04 14:58:23 INFO DAGScheduler: Stage 1 (count at test.scala:13) finished in 0.009 s
14/04/04 14:58:23 INFO SparkContext: Job finished: count at test.scala:13, took 0.014911306 s
Lines with a: 4, Lines with b: 48
The jar that must be added to the classpath is: \spark-0.9.0-incubating-bin-hadoop1\assembly\target\scala-2.10\spark-assembly_2.10-0.9.0-incubating-hadoop1.0.4.jar