scala-spark reduce, reduceByKey, sortBy, lookup, take, saveAsTextFile

import java.text.SimpleDateFormat
import java.util.Date

import org.apache.spark._

object Reduce_demo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Transformation1").setMaster("local")
    val spark = new SparkContext(conf)

    // Sample pair RDDs; a1-a3 are spare test data, only a4 is used below
    val a1 = spark.parallelize(List(('a', 1), ('b', 1)))
    val a2 = spark.parallelize(List(('c', 1), ('d', 1)))
    val a3 = spark.parallelize(List(('a', 1), ('b', 1), ('a', 1)))
    val a4 = spark.parallelize(List(('c', 1), ('d', 1), ('b', 1), ('b', 2), ('b', 3), ('a', 1), ('a', 2)))

    // lookup (action): all values for one key
    for (i <- a4.lookup('a')) println(i)

    // reduce (action): combine all elements with a binary function
    val a5 = spark.parallelize(List(1, 2, 4, 3, 5))
    println(a5.reduce(_ + _))

    // reduceByKey (transformation): sum the values of each key
    val r4 = a4.reduceByKey(_ + _)
    r4.foreach(println)

    // sortBy (transformation): order the pairs by their summed value
    val s4 = r4.sortBy(_._2)
    s4.foreach(println)

    // take (action): bring the first n elements to the driver as an Array
    for ((a, b) <- a4.take(1)) println("t4:(" + a + "," + b + ")")

    // saveAsTextFile (action): write the sorted result to a timestamped HDFS directory
    val iString = new SimpleDateFormat("yyyyMMddHHmmssSSS").format(new Date())
    val soutput = "hdfs://hadoop102:9000/output/" + iString // replace hadoop102 with your real NameNode host/IP
    s4.saveAsTextFile(soutput)

    spark.stop()
  }
}
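
One note on the println calls: with setMaster("local") everything runs in the driver JVM, so r4.foreach(println) and s4.foreach(println) print to this console. On a real cluster those closures run on executors and the lines end up in executor logs instead. A portable sketch (same r4, s4, a4 as above) is to pull the data back to the driver first:

// collect() returns an Array on the driver, so println always prints locally
r4.collect().foreach(println)
s4.collect().foreach(println)

// take() likewise returns a driver-side Array
a4.take(1).foreach { case (k, v) => println(s"t4:($k,$v)") }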
"/Applications/IntelliJ IDEA.app/Contents/jbr/Contents/Home/bin/java" "-javaagent:/Applications/IntelliJ IDEA.app/Contents/lib/idea_rt.jar=60623:/Applications/IntelliJ IDEA.app/Contents/bin" -Dfile.encoding=UTF-8 -classpath /Users/andrew/IdeaProjects/scala_from_scratch/target/classes:/Users/andrew/.ivy2/cache/org.scala-lang/scala-reflect/jars/scala-reflect-2.12.10.jar:/Users/andrew/.ivy2/cache/org.scala-lang/scala-library/jars/scala-library-2.12.10.jar:/Users/andrew/.m2/repository/org/apache/spark/spark-core_2.12/3.0.0/spark-core_2.12-3.0.0.jar:/Users/andrew/.m2/repository/com/thoughtworks/paranamer/paranamer/2.8/paranamer-2.8.jar:/Users/andrew/.m2/repository/org/apache/avro/avro/1.8.2/avro-1.8.2.jar:/Users/andrew/.m2/repository/org/codehaus/jackson/jackson-core-asl/1.9.13/jackson-core-asl-1.9.13.jar:/Users/andrew/.m2/repository/org/codehaus/jackson/jackson-mapper-asl/1.9.13/jackson-mapper-asl-1.9.13.jar:/Users/andrew/.m2/repository/org/apache/commons/commons-compress/1.8.1/commons-compress-1.8.1.jar:/Users/andrew/.m2/repository/org/tukaani/xz/1.5/xz-1.5.jar:/Users/andrew/.m2/repository/org/apache/avro/avro-mapred/1.8.2/avro-mapred-1.8.2-hadoop2.jar:/Users/andrew/.m2/repository/org/apache/avro/avro-ipc/1.8.2/avro-ipc-1.8.2.jar:/Users/andrew/.m2/repository/commons-codec/commons-codec/1.9/commons-codec-1.9.jar:/Users/andrew/.m2/repository/com/twitter/chill_2.12/0.9.5/chill_2.12-0.9.5.jar:/Users/andrew/.m2/repository/com/esotericsoftware/kryo-shaded/4.0.2/kryo-shaded-4.0.2.jar:/Users/andrew/.m2/repository/com/esotericsoftware/minlog/1.3.0/minlog-1.3.0.jar:/Users/andrew/.m2/repository/org/objenesis/objenesis/2.5.1/objenesis-2.5.1.jar:/Users/andrew/.m2/repository/com/twitter/chill-java/0.9.5/chill-java-0.9.5.jar:/Users/andrew/.m2/repository/org/apache/xbean/xbean-asm7-shaded/4.15/xbean-asm7-shaded-4.15.jar:/Users/andrew/.m2/repository/org/apache/hadoop/hadoop-client/2.7.4/hadoop-client-2.7.4.jar:/Users/andrew/.m2/repository/org/apache/hadoop/hadoop-common/2.7.4/hadoop-common-2.7.4.jar:/Users/andrew/.m2/repository/commons-cli/commons-cli/1.2/commons-cli-1.2.jar:/Users/andrew/.m2/repository/xmlenc/xmlenc/0.52/xmlenc-0.52.jar:/Users/andrew/.m2/repository/commons-httpclient/commons-httpclient/3.1/commons-httpclient-3.1.jar:/Users/andrew/.m2/repository/commons-io/commons-io/2.4/commons-io-2.4.jar:/Users/andrew/.m2/repository/commons-collections/commons-collections/3.2.2/commons-collections-3.2.2.jar:/Users/andrew/.m2/repository/org/mortbay/jetty/jetty-sslengine/6.1.26/jetty-sslengine-6.1.26.jar:/Users/andrew/.m2/repository/javax/servlet/jsp/jsp-api/2.1/jsp-api-2.1.jar:/Users/andrew/.m2/repository/commons-lang/commons-lang/2.6/commons-lang-2.6.jar:/Users/andrew/.m2/repository/commons-configuration/commons-configuration/1.6/commons-configuration-1.6.jar:/Users/andrew/.m2/repository/commons-digester/commons-digester/1.8/commons-digester-1.8.jar:/Users/andrew/.m2/repository/commons-beanutils/commons-beanutils/1.7.0/commons-beanutils-1.7.0.jar:/Users/andrew/.m2/repository/com/google/protobuf/protobuf-java/2.5.0/protobuf-java-2.5.0.jar:/Users/andrew/.m2/repository/com/google/code/gson/gson/2.2.4/gson-2.2.4.jar:/Users/andrew/.m2/repository/org/apache/hadoop/hadoop-auth/2.7.4/hadoop-auth-2.7.4.jar:/Users/andrew/.m2/repository/org/apache/httpcomponents/httpclient/4.2.5/httpclient-4.2.5.jar:/Users/andrew/.m2/repository/org/apache/httpcomponents/httpcore/4.2.4/httpcore-4.2.4.jar:/Users/andrew/.m2/repository/org/apache/directory/server/apacheds-kerberos-codec/2.0.0-M15/apacheds-kerberos-codec-2.0.0-M15.jar:/User
s/andrew/.m2/repository/org/apache/directory/server/apacheds-i18n/2.0.0-M15/apacheds-i18n-2.0.0-M15.jar:/Users/andrew/.m2/repository/org/apache/directory/api/api-asn1-api/1.0.0-M20/api-asn1-api-1.0.0-M20.jar:/Users/andrew/.m2/repository/org/apache/directory/api/api-util/1.0.0-M20/api-util-1.0.0-M20.jar:/Users/andrew/.m2/repository/org/apache/curator/curator-client/2.7.1/curator-client-2.7.1.jar:/Users/andrew/.m2/repository/org/apache/htrace/htrace-core/3.1.0-incubating/htrace-core-3.1.0-incubating.jar:/Users/andrew/.m2/repository/org/apache/hadoop/hadoop-hdfs/2.7.4/hadoop-hdfs-2.7.4.jar:/Users/andrew/.m2/repository/org/mortbay/jetty/jetty-util/6.1.26/jetty-util-6.1.26.jar:/Users/andrew/.m2/repository/xerces/xercesImpl/2.9.1/xercesImpl-2.9.1.jar:/Users/andrew/.m2/repository/xml-apis/xml-apis/1.3.04/xml-apis-1.3.04.jar:/Users/andrew/.m2/repository/org/apache/hadoop/hadoop-mapreduce-client-app/2.7.4/hadoop-mapreduce-client-app-2.7.4.jar:/Users/andrew/.m2/repository/org/apache/hadoop/hadoop-mapreduce-client-common/2.7.4/hadoop-mapreduce-client-common-2.7.4.jar:/Users/andrew/.m2/repository/org/apache/hadoop/hadoop-yarn-client/2.7.4/hadoop-yarn-client-2.7.4.jar:/Users/andrew/.m2/repository/org/apache/hadoop/hadoop-yarn-server-common/2.7.4/hadoop-yarn-server-common-2.7.4.jar:/Users/andrew/.m2/repository/org/apache/hadoop/hadoop-mapreduce-client-shuffle/2.7.4/hadoop-mapreduce-client-shuffle-2.7.4.jar:/Users/andrew/.m2/repository/org/apache/hadoop/hadoop-yarn-api/2.7.4/hadoop-yarn-api-2.7.4.jar:/Users/andrew/.m2/repository/org/apache/hadoop/hadoop-mapreduce-client-core/2.7.4/hadoop-mapreduce-client-core-2.7.4.jar:/Users/andrew/.m2/repository/org/apache/hadoop/hadoop-yarn-common/2.7.4/hadoop-yarn-common-2.7.4.jar:/Users/andrew/.m2/repository/javax/xml/bind/jaxb-api/2.2.2/jaxb-api-2.2.2.jar:/Users/andrew/.m2/repository/javax/xml/stream/stax-api/1.0-2/stax-api-1.0-2.jar:/Users/andrew/.m2/repository/org/codehaus/jackson/jackson-jaxrs/1.9.13/jackson-jaxrs-1.9.13.jar:/Users/andrew/.m2/repository/org/codehaus/jackson/jackson-xc/1.9.13/jackson-xc-1.9.13.jar:/Users/andrew/.m2/repository/org/apache/hadoop/hadoop-mapreduce-client-jobclient/2.7.4/hadoop-mapreduce-client-jobclient-2.7.4.jar:/Users/andrew/.m2/repository/org/apache/hadoop/hadoop-annotations/2.7.4/hadoop-annotations-2.7.4.jar:/Users/andrew/.m2/repository/org/apache/spark/spark-launcher_2.12/3.0.0/spark-launcher_2.12-3.0.0.jar:/Users/andrew/.m2/repository/org/apache/spark/spark-kvstore_2.12/3.0.0/spark-kvstore_2.12-3.0.0.jar:/Users/andrew/.m2/repository/org/fusesource/leveldbjni/leveldbjni-all/1.8/leveldbjni-all-1.8.jar:/Users/andrew/.m2/repository/com/fasterxml/jackson/core/jackson-core/2.10.0/jackson-core-2.10.0.jar:/Users/andrew/.m2/repository/com/fasterxml/jackson/core/jackson-annotations/2.10.0/jackson-annotations-2.10.0.jar:/Users/andrew/.m2/repository/org/apache/spark/spark-network-common_2.12/3.0.0/spark-network-common_2.12-3.0.0.jar:/Users/andrew/.m2/repository/org/apache/spark/spark-network-shuffle_2.12/3.0.0/spark-network-shuffle_2.12-3.0.0.jar:/Users/andrew/.m2/repository/org/apache/spark/spark-unsafe_2.12/3.0.0/spark-unsafe_2.12-3.0.0.jar:/Users/andrew/.m2/repository/javax/activation/activation/1.1.1/activation-1.1.1.jar:/Users/andrew/.m2/repository/org/apache/curator/curator-recipes/2.7.1/curator-recipes-2.7.1.jar:/Users/andrew/.m2/repository/org/apache/curator/curator-framework/2.7.1/curator-framework-2.7.1.jar:/Users/andrew/.m2/repository/com/google/guava/guava/16.0.1/guava-16.0.1.jar:/Users/andrew/.m2/repository/org/apache/zookeeper
/zookeeper/3.4.14/zookeeper-3.4.14.jar:/Users/andrew/.m2/repository/org/apache/yetus/audience-annotations/0.5.0/audience-annotations-0.5.0.jar:/Users/andrew/.m2/repository/javax/servlet/javax.servlet-api/3.1.0/javax.servlet-api-3.1.0.jar:/Users/andrew/.m2/repository/org/apache/commons/commons-lang3/3.9/commons-lang3-3.9.jar:/Users/andrew/.m2/repository/org/apache/commons/commons-math3/3.4.1/commons-math3-3.4.1.jar:/Users/andrew/.m2/repository/org/apache/commons/commons-text/1.6/commons-text-1.6.jar:/Users/andrew/.m2/repository/com/google/code/findbugs/jsr305/3.0.0/jsr305-3.0.0.jar:/Users/andrew/.m2/repository/org/slf4j/slf4j-api/1.7.30/slf4j-api-1.7.30.jar:/Users/andrew/.m2/repository/org/slf4j/jul-to-slf4j/1.7.30/jul-to-slf4j-1.7.30.jar:/Users/andrew/.m2/repository/org/slf4j/jcl-over-slf4j/1.7.30/jcl-over-slf4j-1.7.30.jar:/Users/andrew/.m2/repository/log4j/log4j/1.2.17/log4j-1.2.17.jar:/Users/andrew/.m2/repository/org/slf4j/slf4j-log4j12/1.7.30/slf4j-log4j12-1.7.30.jar:/Users/andrew/.m2/repository/com/ning/compress-lzf/1.0.3/compress-lzf-1.0.3.jar:/Users/andrew/.m2/repository/org/xerial/snappy/snappy-java/1.1.7.5/snappy-java-1.1.7.5.jar:/Users/andrew/.m2/repository/org/lz4/lz4-java/1.7.1/lz4-java-1.7.1.jar:/Users/andrew/.m2/repository/com/github/luben/zstd-jni/1.4.4-3/zstd-jni-1.4.4-3.jar:/Users/andrew/.m2/repository/org/roaringbitmap/RoaringBitmap/0.7.45/RoaringBitmap-0.7.45.jar:/Users/andrew/.m2/repository/org/roaringbitmap/shims/0.7.45/shims-0.7.45.jar:/Users/andrew/.m2/repository/commons-net/commons-net/3.1/commons-net-3.1.jar:/Users/andrew/.m2/repository/org/scala-lang/modules/scala-xml_2.12/1.2.0/scala-xml_2.12-1.2.0.jar:/Users/andrew/.m2/repository/org/scala-lang/scala-library/2.12.10/scala-library-2.12.10.jar:/Users/andrew/.m2/repository/org/scala-lang/scala-reflect/2.12.10/scala-reflect-2.12.10.jar:/Users/andrew/.m2/repository/org/json4s/json4s-jackson_2.12/3.6.6/json4s-jackson_2.12-3.6.6.jar:/Users/andrew/.m2/repository/org/json4s/json4s-core_2.12/3.6.6/json4s-core_2.12-3.6.6.jar:/Users/andrew/.m2/repository/org/json4s/json4s-ast_2.12/3.6.6/json4s-ast_2.12-3.6.6.jar:/Users/andrew/.m2/repository/org/json4s/json4s-scalap_2.12/3.6.6/json4s-scalap_2.12-3.6.6.jar:/Users/andrew/.m2/repository/org/glassfish/jersey/core/jersey-client/2.30/jersey-client-2.30.jar:/Users/andrew/.m2/repository/jakarta/ws/rs/jakarta.ws.rs-api/2.1.6/jakarta.ws.rs-api-2.1.6.jar:/Users/andrew/.m2/repository/org/glassfish/hk2/external/jakarta.inject/2.6.1/jakarta.inject-2.6.1.jar:/Users/andrew/.m2/repository/org/glassfish/jersey/core/jersey-common/2.30/jersey-common-2.30.jar:/Users/andrew/.m2/repository/jakarta/annotation/jakarta.annotation-api/1.3.5/jakarta.annotation-api-1.3.5.jar:/Users/andrew/.m2/repository/org/glassfish/hk2/osgi-resource-locator/1.0.3/osgi-resource-locator-1.0.3.jar:/Users/andrew/.m2/repository/org/glassfish/jersey/core/jersey-server/2.30/jersey-server-2.30.jar:/Users/andrew/.m2/repository/org/glassfish/jersey/media/jersey-media-jaxb/2.30/jersey-media-jaxb-2.30.jar:/Users/andrew/.m2/repository/jakarta/validation/jakarta.validation-api/2.0.2/jakarta.validation-api-2.0.2.jar:/Users/andrew/.m2/repository/org/glassfish/jersey/containers/jersey-container-servlet/2.30/jersey-container-servlet-2.30.jar:/Users/andrew/.m2/repository/org/glassfish/jersey/containers/jersey-container-servlet-core/2.30/jersey-container-servlet-core-2.30.jar:/Users/andrew/.m2/repository/org/glassfish/jersey/inject/jersey-hk2/2.30/jersey-hk2-2.30.jar:/Users/andrew/.m2/repository/org/glassfish/hk2/hk2-locator/2.6.1/hk2-loca
tor-2.6.1.jar:/Users/andrew/.m2/repository/org/glassfish/hk2/external/aopalliance-repackaged/2.6.1/aopalliance-repackaged-2.6.1.jar:/Users/andrew/.m2/repository/org/glassfish/hk2/hk2-api/2.6.1/hk2-api-2.6.1.jar:/Users/andrew/.m2/repository/org/glassfish/hk2/hk2-utils/2.6.1/hk2-utils-2.6.1.jar:/Users/andrew/.m2/repository/org/javassist/javassist/3.25.0-GA/javassist-3.25.0-GA.jar:/Users/andrew/.m2/repository/io/netty/netty-all/4.1.47.Final/netty-all-4.1.47.Final.jar:/Users/andrew/.m2/repository/com/clearspring/analytics/stream/2.9.6/stream-2.9.6.jar:/Users/andrew/.m2/repository/io/dropwizard/metrics/metrics-core/4.1.1/metrics-core-4.1.1.jar:/Users/andrew/.m2/repository/io/dropwizard/metrics/metrics-jvm/4.1.1/metrics-jvm-4.1.1.jar:/Users/andrew/.m2/repository/io/dropwizard/metrics/metrics-json/4.1.1/metrics-json-4.1.1.jar:/Users/andrew/.m2/repository/io/dropwizard/metrics/metrics-graphite/4.1.1/metrics-graphite-4.1.1.jar:/Users/andrew/.m2/repository/io/dropwizard/metrics/metrics-jmx/4.1.1/metrics-jmx-4.1.1.jar:/Users/andrew/.m2/repository/com/fasterxml/jackson/core/jackson-databind/2.10.0/jackson-databind-2.10.0.jar:/Users/andrew/.m2/repository/com/fasterxml/jackson/module/jackson-module-scala_2.12/2.10.0/jackson-module-scala_2.12-2.10.0.jar:/Users/andrew/.m2/repository/com/fasterxml/jackson/module/jackson-module-paranamer/2.10.0/jackson-module-paranamer-2.10.0.jar:/Users/andrew/.m2/repository/org/apache/ivy/ivy/2.4.0/ivy-2.4.0.jar:/Users/andrew/.m2/repository/oro/oro/2.0.8/oro-2.0.8.jar:/Users/andrew/.m2/repository/net/razorvine/pyrolite/4.30/pyrolite-4.30.jar:/Users/andrew/.m2/repository/net/sf/py4j/py4j/0.10.9/py4j-0.10.9.jar:/Users/andrew/.m2/repository/org/apache/spark/spark-tags_2.12/3.0.0/spark-tags_2.12-3.0.0.jar:/Users/andrew/.m2/repository/org/apache/commons/commons-crypto/1.0.0/commons-crypto-1.0.0.jar:/Users/andrew/.m2/repository/org/spark-project/spark/unused/1.0.0/unused-1.0.0.jar Reduce_demo
WARNING: An illegal reflective access operation has occurred
WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/Users/andrew/.m2/repository/org/apache/spark/spark-unsafe_2.12/3.0.0/spark-unsafe_2.12-3.0.0.jar) to constructor java.nio.DirectByteBuffer(long,int)
WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform
WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations
WARNING: All illegal access operations will be denied in a future release
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
20/09/01 00:42:56 INFO SparkContext: Running Spark version 3.0.0
20/09/01 00:42:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
20/09/01 00:42:57 INFO ResourceUtils: ==============================================================
20/09/01 00:42:57 INFO ResourceUtils: Resources for spark.driver:

20/09/01 00:42:57 INFO ResourceUtils: ==============================================================
20/09/01 00:42:57 INFO SparkContext: Submitted application: Transformation1
20/09/01 00:42:57 INFO SecurityManager: Changing view acls to: andrew
20/09/01 00:42:57 INFO SecurityManager: Changing modify acls to: andrew
20/09/01 00:42:57 INFO SecurityManager: Changing view acls groups to: 
20/09/01 00:42:57 INFO SecurityManager: Changing modify acls groups to: 
20/09/01 00:42:57 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users  with view permissions: Set(andrew); groups with view permissions: Set(); users  with modify permissions: Set(andrew); groups with modify permissions: Set()
20/09/01 00:42:57 INFO Utils: Successfully started service 'sparkDriver' on port 60638.
20/09/01 00:42:57 INFO SparkEnv: Registering MapOutputTracker
20/09/01 00:42:57 INFO SparkEnv: Registering BlockManagerMaster
20/09/01 00:42:57 INFO BlockManagerMasterEndpoint: Using org.apache.spark.storage.DefaultTopologyMapper for getting topology information
20/09/01 00:42:57 INFO BlockManagerMasterEndpoint: BlockManagerMasterEndpoint up
20/09/01 00:42:57 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
20/09/01 00:42:57 INFO DiskBlockManager: Created local directory at /private/var/folders/s8/_5116jwj03l5sdj__s1jv4pr0000gn/T/blockmgr-f148079d-2758-421f-b940-2dd1782d1887
20/09/01 00:42:57 INFO MemoryStore: MemoryStore started with capacity 9.4 GiB
20/09/01 00:42:57 INFO SparkEnv: Registering OutputCommitCoordinator
20/09/01 00:42:57 INFO Utils: Successfully started service 'SparkUI' on port 4040.
20/09/01 00:42:57 INFO SparkUI: Bound SparkUI to 0.0.0.0, and started at http://imac-pro:4040
20/09/01 00:42:58 INFO Executor: Starting executor ID driver on host imac-pro
20/09/01 00:42:58 INFO Utils: Successfully started service 'org.apache.spark.network.netty.NettyBlockTransferService' on port 60643.
20/09/01 00:42:58 INFO NettyBlockTransferService: Server created on imac-pro:60643
20/09/01 00:42:58 INFO BlockManager: Using org.apache.spark.storage.RandomBlockReplicationPolicy for block replication policy
20/09/01 00:42:58 INFO BlockManagerMaster: Registering BlockManager BlockManagerId(driver, imac-pro, 60643, None)
20/09/01 00:42:58 INFO BlockManagerMasterEndpoint: Registering block manager imac-pro:60643 with 9.4 GiB RAM, BlockManagerId(driver, imac-pro, 60643, None)
20/09/01 00:42:58 INFO BlockManagerMaster: Registered BlockManager BlockManagerId(driver, imac-pro, 60643, None)
20/09/01 00:42:58 INFO BlockManager: Initialized BlockManager: BlockManagerId(driver, imac-pro, 60643, None)
20/09/01 00:42:58 INFO SparkContext: Starting job: lookup at Reduce_demo.scala:14
20/09/01 00:42:58 INFO DAGScheduler: Got job 0 (lookup at Reduce_demo.scala:14) with 1 output partitions
20/09/01 00:42:58 INFO DAGScheduler: Final stage: ResultStage 0 (lookup at Reduce_demo.scala:14)
20/09/01 00:42:58 INFO DAGScheduler: Parents of final stage: List()
20/09/01 00:42:58 INFO DAGScheduler: Missing parents: List()
20/09/01 00:42:58 INFO DAGScheduler: Submitting ResultStage 0 (MapPartitionsRDD[5] at lookup at Reduce_demo.scala:14), which has no missing parents
20/09/01 00:42:58 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 3.5 KiB, free 9.4 GiB)
20/09/01 00:42:58 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 1986.0 B, free 9.4 GiB)
20/09/01 00:42:58 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on imac-pro:60643 (size: 1986.0 B, free: 9.4 GiB)
20/09/01 00:42:58 INFO SparkContext: Created broadcast 0 from broadcast at DAGScheduler.scala:1200
20/09/01 00:42:58 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 0 (MapPartitionsRDD[5] at lookup at Reduce_demo.scala:14) (first 15 tasks are for partitions Vector(0))
20/09/01 00:42:58 INFO TaskSchedulerImpl: Adding task set 0.0 with 1 tasks
20/09/01 00:42:59 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, imac-pro, executor driver, partition 0, PROCESS_LOCAL, 7545 bytes)
20/09/01 00:42:59 INFO Executor: Running task 0.0 in stage 0.0 (TID 0)
20/09/01 00:42:59 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 828 bytes result sent to driver
20/09/01 00:42:59 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 108 ms on imac-pro (executor driver) (1/1)
20/09/01 00:42:59 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool 
20/09/01 00:42:59 INFO DAGScheduler: ResultStage 0 (lookup at Reduce_demo.scala:14) finished in 0.527 s
20/09/01 00:42:59 INFO DAGScheduler: Job 0 is finished. Cancelling potential speculative or zombie tasks for this job
20/09/01 00:42:59 INFO TaskSchedulerImpl: Killing all running tasks in stage 0: Stage finished
20/09/01 00:42:59 INFO DAGScheduler: Job 0 finished: lookup at Reduce_demo.scala:14, took 0.564556 s
1
2
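
The 1 and 2 above are the result of a4.lookup('a'): lookup is an action that returns every value bound to the given key as a Seq, here from ('a', 1) and ('a', 2). When the RDD has no partitioner it behaves roughly like the filter sketch below (with a hash-partitioned RDD, lookup can instead scan only the one partition that owns the key):

// Roughly what lookup does on an unpartitioned RDD
val valuesForA = a4.filter { case (k, _) => k == 'a' }.map(_._2).collect()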
20/09/01 00:42:59 INFO SparkContext: Starting job: reduce at Reduce_demo.scala:19
20/09/01 00:42:59 INFO DAGScheduler: Got job 1 (reduce at Reduce_demo.scala:19) with 1 output partitions
20/09/01 00:42:59 INFO DAGScheduler: Final stage: ResultStage 1 (reduce at Reduce_demo.scala:19)
20/09/01 00:42:59 INFO DAGScheduler: Parents of final stage: List()
20/09/01 00:42:59 INFO DAGScheduler: Missing parents: List()
20/09/01 00:42:59 INFO DAGScheduler: Submitting ResultStage 1 (ParallelCollectionRDD[6] at parallelize at Reduce_demo.scala:18), which has no missing parents
20/09/01 00:42:59 INFO MemoryStore: Block broadcast_1 stored as values in memory (estimated size 2.2 KiB, free 9.4 GiB)
20/09/01 00:42:59 INFO MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 1385.0 B, free 9.4 GiB)
20/09/01 00:42:59 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on imac-pro:60643 (size: 1385.0 B, free: 9.4 GiB)
20/09/01 00:42:59 INFO SparkContext: Created broadcast 1 from broadcast at DAGScheduler.scala:1200
20/09/01 00:42:59 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 1 (ParallelCollectionRDD[6] at parallelize at Reduce_demo.scala:18) (first 15 tasks are for partitions Vector(0))
20/09/01 00:42:59 INFO TaskSchedulerImpl: Adding task set 1.0 with 1 tasks
20/09/01 00:42:59 INFO TaskSetManager: Starting task 0.0 in stage 1.0 (TID 1, imac-pro, executor driver, partition 0, PROCESS_LOCAL, 7294 bytes)
20/09/01 00:42:59 INFO Executor: Running task 0.0 in stage 1.0 (TID 1)
20/09/01 00:42:59 INFO Executor: Finished task 0.0 in stage 1.0 (TID 1). 957 bytes result sent to driver
20/09/01 00:42:59 INFO TaskSetManager: Finished task 0.0 in stage 1.0 (TID 1) in 12 ms on imac-pro (executor driver) (1/1)
20/09/01 00:42:59 INFO TaskSchedulerImpl: Removed TaskSet 1.0, whose tasks have all completed, from pool 
20/09/01 00:42:59 INFO DAGScheduler: ResultStage 1 (reduce at Reduce_demo.scala:19) finished in 0.022 s
20/09/01 00:42:59 INFO DAGScheduler: Job 1 is finished. Cancelling potential speculative or zombie tasks for this job
20/09/01 00:42:59 INFO TaskSchedulerImpl: Killing all running tasks in stage 1: Stage finished
20/09/01 00:42:59 INFO DAGScheduler: Job 1 finished: reduce at Reduce_demo.scala:19, took 0.024471 s
15
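
The 15 is a5.reduce(_ + _) over List(1, 2, 4, 3, 5), i.e. 1 + 2 + 4 + 3 + 5. Partitions are combined in no guaranteed order, so the function passed to reduce should be associative and commutative. fold is the sibling that takes an explicit zero element; a quick sketch on the same RDD:

// fold applies the zero once per partition and again when merging partitions
println(a5.fold(0)(_ + _)) // also prints 15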
20/09/01 00:42:59 INFO SparkContext: Starting job: foreach at Reduce_demo.scala:23
20/09/01 00:42:59 INFO DAGScheduler: Registering RDD 3 (parallelize at Reduce_demo.scala:13) as input to shuffle 0
20/09/01 00:42:59 INFO DAGScheduler: Got job 2 (foreach at Reduce_demo.scala:23) with 1 output partitions
20/09/01 00:42:59 INFO DAGScheduler: Final stage: ResultStage 3 (foreach at Reduce_demo.scala:23)
20/09/01 00:42:59 INFO DAGScheduler: Parents of final stage: List(ShuffleMapStage 2)
20/09/01 00:42:59 INFO DAGScheduler: Missing parents: List(ShuffleMapStage 2)
20/09/01 00:42:59 INFO DAGScheduler: Submitting ShuffleMapStage 2 (ParallelCollectionRDD[3] at parallelize at Reduce_demo.scala:13), which has no missing parents
20/09/01 00:42:59 INFO MemoryStore: Block broadcast_2 stored as values in memory (estimated size 4.2 KiB, free 9.4 GiB)
20/09/01 00:42:59 INFO MemoryStore: Block broadcast_2_piece0 stored as bytes in memory (estimated size 2.6 KiB, free 9.4 GiB)
20/09/01 00:42:59 INFO BlockManagerInfo: Added broadcast_2_piece0 in memory on imac-pro:60643 (size: 2.6 KiB, free: 9.4 GiB)
20/09/01 00:42:59 INFO SparkContext: Created broadcast 2 from broadcast at DAGScheduler.scala:1200
20/09/01 00:42:59 INFO DAGScheduler: Submitting 1 missing tasks from ShuffleMapStage 2 (ParallelCollectionRDD[3] at parallelize at Reduce_demo.scala:13) (first 15 tasks are for partitions Vector(0))
20/09/01 00:42:59 INFO TaskSchedulerImpl: Adding task set 2.0 with 1 tasks
20/09/01 00:42:59 INFO TaskSetManager: Starting task 0.0 in stage 2.0 (TID 2, imac-pro, executor driver, partition 0, PROCESS_LOCAL, 7534 bytes)
20/09/01 00:42:59 INFO Executor: Running task 0.0 in stage 2.0 (TID 2)
20/09/01 00:42:59 INFO Executor: Finished task 0.0 in stage 2.0 (TID 2). 1157 bytes result sent to driver
20/09/01 00:42:59 INFO TaskSetManager: Finished task 0.0 in stage 2.0 (TID 2) in 43 ms on imac-pro (executor driver) (1/1)
20/09/01 00:42:59 INFO TaskSchedulerImpl: Removed TaskSet 2.0, whose tasks have all completed, from pool 
20/09/01 00:42:59 INFO DAGScheduler: ShuffleMapStage 2 (parallelize at Reduce_demo.scala:13) finished in 0.075 s
20/09/01 00:42:59 INFO DAGScheduler: looking for newly runnable stages
20/09/01 00:42:59 INFO DAGScheduler: running: Set()
20/09/01 00:42:59 INFO DAGScheduler: waiting: Set(ResultStage 3)
20/09/01 00:42:59 INFO DAGScheduler: failed: Set()
20/09/01 00:42:59 INFO DAGScheduler: Submitting ResultStage 3 (ShuffledRDD[7] at reduceByKey at Reduce_demo.scala:22), which has no missing parents
20/09/01 00:42:59 INFO MemoryStore: Block broadcast_3 stored as values in memory (estimated size 4.3 KiB, free 9.4 GiB)
20/09/01 00:42:59 INFO MemoryStore: Block broadcast_3_piece0 stored as bytes in memory (estimated size 2.5 KiB, free 9.4 GiB)
20/09/01 00:42:59 INFO BlockManagerInfo: Added broadcast_3_piece0 in memory on imac-pro:60643 (size: 2.5 KiB, free: 9.4 GiB)
20/09/01 00:42:59 INFO SparkContext: Created broadcast 3 from broadcast at DAGScheduler.scala:1200
20/09/01 00:42:59 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 3 (ShuffledRDD[7] at reduceByKey at Reduce_demo.scala:22) (first 15 tasks are for partitions Vector(0))
20/09/01 00:42:59 INFO TaskSchedulerImpl: Adding task set 3.0 with 1 tasks
20/09/01 00:42:59 INFO TaskSetManager: Starting task 0.0 in stage 3.0 (TID 3, imac-pro, executor driver, partition 0, NODE_LOCAL, 7143 bytes)
20/09/01 00:42:59 INFO Executor: Running task 0.0 in stage 3.0 (TID 3)
20/09/01 00:42:59 INFO BlockManagerInfo: Removed broadcast_0_piece0 on imac-pro:60643 in memory (size: 1986.0 B, free: 9.4 GiB)
20/09/01 00:42:59 INFO ShuffleBlockFetcherIterator: Getting 1 (66.0 B) non-empty blocks including 1 (66.0 B) local and 0 (0.0 B) host-local and 0 (0.0 B) remote blocks
20/09/01 00:42:59 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 6 ms
20/09/01 00:42:59 INFO BlockManagerInfo: Removed broadcast_2_piece0 on imac-pro:60643 in memory (size: 2.6 KiB, free: 9.4 GiB)
20/09/01 00:42:59 INFO BlockManagerInfo: Removed broadcast_1_piece0 on imac-pro:60643 in memory (size: 1385.0 B, free: 9.4 GiB)
(d,1)
(a,3)
(b,6)
(c,1)
20/09/01 00:42:59 INFO Executor: Finished task 0.0 in stage 3.0 (TID 3). 1310 bytes result sent to driver
20/09/01 00:42:59 INFO TaskSetManager: Finished task 0.0 in stage 3.0 (TID 3) in 59 ms on imac-pro (executor driver) (1/1)
20/09/01 00:42:59 INFO TaskSchedulerImpl: Removed TaskSet 3.0, whose tasks have all completed, from pool 
20/09/01 00:42:59 INFO DAGScheduler: ResultStage 3 (foreach at Reduce_demo.scala:23) finished in 0.071 s
20/09/01 00:42:59 INFO DAGScheduler: Job 2 is finished. Cancelling potential speculative or zombie tasks for this job
20/09/01 00:42:59 INFO TaskSchedulerImpl: Killing all running tasks in stage 3: Stage finished
20/09/01 00:42:59 INFO DAGScheduler: Job 2 finished: foreach at Reduce_demo.scala:23, took 0.279078 s
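
The four pairs printed by job 2 are a4.reduceByKey(_ + _): values are summed per key, so b becomes 1 + 2 + 3 = 6 and a becomes 1 + 2 = 3 (the order foreach prints them in is not deterministic). groupByKey would produce the same totals, but it shuffles every raw value, whereas reduceByKey pre-combines on the map side; a sketch of the comparison:

// Same sums, but all values per key are materialized before adding
a4.groupByKey().mapValues(_.sum).foreach(println)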
20/09/01 00:42:59 INFO SparkContext: Starting job: foreach at Reduce_demo.scala:29
20/09/01 00:42:59 INFO DAGScheduler: Registering RDD 8 (sortBy at Reduce_demo.scala:28) as input to shuffle 1
20/09/01 00:42:59 INFO DAGScheduler: Got job 3 (foreach at Reduce_demo.scala:29) with 1 output partitions
20/09/01 00:42:59 INFO DAGScheduler: Final stage: ResultStage 6 (foreach at Reduce_demo.scala:29)
20/09/01 00:42:59 INFO DAGScheduler: Parents of final stage: List(ShuffleMapStage 5)
20/09/01 00:42:59 INFO DAGScheduler: Missing parents: List(ShuffleMapStage 5)
20/09/01 00:42:59 INFO DAGScheduler: Submitting ShuffleMapStage 5 (MapPartitionsRDD[8] at sortBy at Reduce_demo.scala:28), which has no missing parents
20/09/01 00:42:59 INFO MemoryStore: Block broadcast_4 stored as values in memory (estimated size 6.2 KiB, free 9.4 GiB)
20/09/01 00:42:59 INFO MemoryStore: Block broadcast_4_piece0 stored as bytes in memory (estimated size 3.6 KiB, free 9.4 GiB)
20/09/01 00:42:59 INFO BlockManagerInfo: Added broadcast_4_piece0 in memory on imac-pro:60643 (size: 3.6 KiB, free: 9.4 GiB)
20/09/01 00:42:59 INFO SparkContext: Created broadcast 4 from broadcast at DAGScheduler.scala:1200
20/09/01 00:42:59 INFO DAGScheduler: Submitting 1 missing tasks from ShuffleMapStage 5 (MapPartitionsRDD[8] at sortBy at Reduce_demo.scala:28) (first 15 tasks are for partitions Vector(0))
20/09/01 00:42:59 INFO TaskSchedulerImpl: Adding task set 5.0 with 1 tasks
20/09/01 00:42:59 INFO TaskSetManager: Starting task 0.0 in stage 5.0 (TID 4, imac-pro, executor driver, partition 0, NODE_LOCAL, 7132 bytes)
20/09/01 00:42:59 INFO Executor: Running task 0.0 in stage 5.0 (TID 4)
20/09/01 00:42:59 INFO ShuffleBlockFetcherIterator: Getting 1 (66.0 B) non-empty blocks including 1 (66.0 B) local and 0 (0.0 B) host-local and 0 (0.0 B) remote blocks
20/09/01 00:42:59 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 0 ms
20/09/01 00:42:59 INFO Executor: Finished task 0.0 in stage 5.0 (TID 4). 1458 bytes result sent to driver
20/09/01 00:42:59 INFO TaskSetManager: Finished task 0.0 in stage 5.0 (TID 4) in 24 ms on imac-pro (executor driver) (1/1)
20/09/01 00:42:59 INFO TaskSchedulerImpl: Removed TaskSet 5.0, whose tasks have all completed, from pool 
20/09/01 00:42:59 INFO DAGScheduler: ShuffleMapStage 5 (sortBy at Reduce_demo.scala:28) finished in 0.033 s
20/09/01 00:42:59 INFO DAGScheduler: looking for newly runnable stages
20/09/01 00:42:59 INFO DAGScheduler: running: Set()
20/09/01 00:42:59 INFO DAGScheduler: waiting: Set(ResultStage 6)
20/09/01 00:42:59 INFO DAGScheduler: failed: Set()
20/09/01 00:42:59 INFO DAGScheduler: Submitting ResultStage 6 (MapPartitionsRDD[10] at sortBy at Reduce_demo.scala:28), which has no missing parents
20/09/01 00:42:59 INFO MemoryStore: Block broadcast_5 stored as values in memory (estimated size 4.7 KiB, free 9.4 GiB)
20/09/01 00:42:59 INFO MemoryStore: Block broadcast_5_piece0 stored as bytes in memory (estimated size 2.6 KiB, free 9.4 GiB)
20/09/01 00:42:59 INFO BlockManagerInfo: Added broadcast_5_piece0 in memory on imac-pro:60643 (size: 2.6 KiB, free: 9.4 GiB)
20/09/01 00:42:59 INFO SparkContext: Created broadcast 5 from broadcast at DAGScheduler.scala:1200
20/09/01 00:42:59 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 6 (MapPartitionsRDD[10] at sortBy at Reduce_demo.scala:28) (first 15 tasks are for partitions Vector(0))
20/09/01 00:42:59 INFO TaskSchedulerImpl: Adding task set 6.0 with 1 tasks
20/09/01 00:42:59 INFO TaskSetManager: Starting task 0.0 in stage 6.0 (TID 5, imac-pro, executor driver, partition 0, NODE_LOCAL, 7143 bytes)
20/09/01 00:42:59 INFO Executor: Running task 0.0 in stage 6.0 (TID 5)
20/09/01 00:42:59 INFO ShuffleBlockFetcherIterator: Getting 1 (304.0 B) non-empty blocks including 1 (304.0 B) local and 0 (0.0 B) host-local and 0 (0.0 B) remote blocks
20/09/01 00:42:59 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 0 ms
(d,1)
(c,1)
(a,3)
(b,6)
20/09/01 00:42:59 INFO Executor: Finished task 0.0 in stage 6.0 (TID 5). 1267 bytes result sent to driver
20/09/01 00:42:59 INFO TaskSetManager: Finished task 0.0 in stage 6.0 (TID 5) in 13 ms on imac-pro (executor driver) (1/1)
20/09/01 00:42:59 INFO TaskSchedulerImpl: Removed TaskSet 6.0, whose tasks have all completed, from pool 
20/09/01 00:42:59 INFO DAGScheduler: ResultStage 6 (foreach at Reduce_demo.scala:29) finished in 0.019 s
20/09/01 00:42:59 INFO DAGScheduler: Job 3 is finished. Cancelling potential speculative or zombie tasks for this job
20/09/01 00:42:59 INFO TaskSchedulerImpl: Killing all running tasks in stage 6: Stage finished
20/09/01 00:42:59 INFO DAGScheduler: Job 3 finished: foreach at Reduce_demo.scala:29, took 0.058830 s
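
Job 3 is s4 = r4.sortBy(_._2): the pairs come out ordered by their summed value, ascending, which is why (d,1) and (c,1) now precede (a,3) and (b,6). The second parameter flips the direction:

// Descending instead of the default ascending order
r4.sortBy(_._2, ascending = false).foreach(println)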
20/09/01 00:42:59 INFO deprecation: mapred.output.dir is deprecated. Instead, use mapreduce.output.fileoutputformat.outputdir
20/09/01 00:42:59 INFO HadoopMapRedCommitProtocol: Using output committer class org.apache.hadoop.mapred.FileOutputCommitter
20/09/01 00:42:59 INFO FileOutputCommitter: File Output Committer Algorithm version is 1
20/09/01 00:42:59 INFO SparkContext: Starting job: runJob at SparkHadoopWriter.scala:78
20/09/01 00:42:59 INFO DAGScheduler: Got job 4 (runJob at SparkHadoopWriter.scala:78) with 1 output partitions
20/09/01 00:42:59 INFO DAGScheduler: Final stage: ResultStage 9 (runJob at SparkHadoopWriter.scala:78)
20/09/01 00:42:59 INFO DAGScheduler: Parents of final stage: List(ShuffleMapStage 8)
20/09/01 00:42:59 INFO DAGScheduler: Missing parents: List()
20/09/01 00:42:59 INFO DAGScheduler: Submitting ResultStage 9 (MapPartitionsRDD[11] at saveAsTextFile at Reduce_demo.scala:40), which has no missing parents
20/09/01 00:42:59 INFO MemoryStore: Block broadcast_6 stored as values in memory (estimated size 75.1 KiB, free 9.4 GiB)
20/09/01 00:42:59 INFO MemoryStore: Block broadcast_6_piece0 stored as bytes in memory (estimated size 27.6 KiB, free 9.4 GiB)
20/09/01 00:42:59 INFO BlockManagerInfo: Added broadcast_6_piece0 in memory on imac-pro:60643 (size: 27.6 KiB, free: 9.4 GiB)
20/09/01 00:42:59 INFO SparkContext: Created broadcast 6 from broadcast at DAGScheduler.scala:1200
20/09/01 00:42:59 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 9 (MapPartitionsRDD[11] at saveAsTextFile at Reduce_demo.scala:40) (first 15 tasks are for partitions Vector(0))
20/09/01 00:42:59 INFO TaskSchedulerImpl: Adding task set 9.0 with 1 tasks
20/09/01 00:42:59 INFO TaskSetManager: Starting task 0.0 in stage 9.0 (TID 6, imac-pro, executor driver, partition 0, NODE_LOCAL, 7143 bytes)
20/09/01 00:42:59 INFO Executor: Running task 0.0 in stage 9.0 (TID 6)
20/09/01 00:43:00 INFO ShuffleBlockFetcherIterator: Getting 1 (304.0 B) non-empty blocks including 1 (304.0 B) local and 0 (0.0 B) host-local and 0 (0.0 B) remote blocks
20/09/01 00:43:00 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 0 ms
20/09/01 00:43:00 INFO HadoopMapRedCommitProtocol: Using output committer class org.apache.hadoop.mapred.FileOutputCommitter
20/09/01 00:43:00 INFO FileOutputCommitter: File Output Committer Algorithm version is 1
20/09/01 00:43:34 WARN DFSClient: Slow waitForAckedSeqno took 34622ms (threshold=30000ms)
20/09/01 00:43:34 INFO FileOutputCommitter: Saved output of task 'attempt_20200901004259_0011_m_000000_0' to hdfs://hadoop102:9000/output/20200901004259535/_temporary/0/task_20200901004259_0011_m_000000
20/09/01 00:43:34 INFO SparkHadoopMapRedUtil: attempt_20200901004259_0011_m_000000_0: Committed
20/09/01 00:43:34 INFO Executor: Finished task 0.0 in stage 9.0 (TID 6). 1588 bytes result sent to driver
20/09/01 00:43:34 INFO TaskSetManager: Finished task 0.0 in stage 9.0 (TID 6) in 34735 ms on imac-pro (executor driver) (1/1)
20/09/01 00:43:34 INFO TaskSchedulerImpl: Removed TaskSet 9.0, whose tasks have all completed, from pool 
20/09/01 00:43:34 INFO DAGScheduler: ResultStage 9 (runJob at SparkHadoopWriter.scala:78) finished in 34.752 s
20/09/01 00:43:34 INFO DAGScheduler: Job 4 is finished. Cancelling potential speculative or zombie tasks for this job
20/09/01 00:43:34 INFO TaskSchedulerImpl: Killing all running tasks in stage 9: Stage finished
20/09/01 00:43:34 INFO DAGScheduler: Job 4 finished: runJob at SparkHadoopWriter.scala:78, took 34.756045 s
20/09/01 00:43:34 INFO SparkHadoopWriter: Job job_20200901004259_0011 committed.
20/09/01 00:43:34 INFO SparkUI: Stopped Spark web UI at http://imac-pro:4040
20/09/01 00:43:34 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
20/09/01 00:43:34 INFO MemoryStore: MemoryStore cleared
20/09/01 00:43:34 INFO BlockManager: BlockManager stopped
20/09/01 00:43:34 INFO BlockManagerMaster: BlockManagerMaster stopped
20/09/01 00:43:34 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
20/09/01 00:43:34 INFO SparkContext: Successfully stopped SparkContext
20/09/01 00:43:34 INFO ShutdownHookManager: Shutdown hook called
20/09/01 00:43:34 INFO ShutdownHookManager: Deleting directory /private/var/folders/s8/_5116jwj03l5sdj__s1jv4pr0000gn/T/spark-64d0fb77-a952-4232-9653-6e38cc58bf41

Process finished with exit code 0
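
Job 4 wrote s4 to hdfs://hadoop102:9000/output/20200901004259535; the WARN about a slow waitForAckedSeqno only means the remote HDFS acknowledged the write slowly, and the commit still succeeded. saveAsTextFile creates one part-NNNNN file per partition plus a _SUCCESS marker in that directory. To check the result you can run hdfs dfs -cat on the directory, or read it back with a live SparkContext (a sketch; run it before spark.stop(), or from a fresh session):

// Each line is a tuple rendered by toString, e.g. (d,1)
spark.textFile("hdfs://hadoop102:9000/output/20200901004259535").collect().foreach(println)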
