Getting Started with Spark: WordCount Explained in Detail


import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;

import java.util.Arrays;


public class WordCount {
    public static void main(String[] args) {

        // Step 1: create a SparkConf object to hold the configuration for this Spark application.
        // setMaster() sets the URL of the master node of the Spark cluster the application should connect to;
        // setting it to "local" means the application runs locally instead.
        SparkConf conf = new SparkConf()
                .setAppName("WordCountLocal")
                .setMaster("local");

        // Step 2: create a JavaSparkContext object.
        // In Spark, the SparkContext is the entry point to all functionality. Whether you write in Java,
        // Scala, or even Python, you must have a SparkContext. Its main responsibilities include
        // initializing the core components the application needs, such as the schedulers
        // (DAGScheduler and TaskScheduler), and registering the application with the Spark master.
        // In short, the SparkContext is arguably the single most important object in a Spark application.
        // However, different kinds of Spark applications use different context classes: with Scala you use
        // the native SparkContext, while with Java you use JavaSparkContext.
        // For Spark SQL programs it is SQLContext or HiveContext,
        // for Spark Streaming programs it is the streaming-specific context, and so on.
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Step 3: create an initial RDD from the input source (an HDFS file, a local file, etc.).
        // The input data is split up and distributed across the partitions of the RDD, forming an
        // initial distributed dataset. Because this is a local test, we read a local file here.
        // The SparkContext method that creates an RDD from a file-based input source is textFile().
        // In Java, a plain RDD is represented by JavaRDD.
        // An RDD is a collection of elements; when the source is an HDFS or local text file,
        // each element of the resulting RDD corresponds to one line of the file.
        JavaRDD<String> lines = sc.textFile("C:/Users/Think/Desktop/spark.txt");

        // Step 4: apply transformation operations (i.e. computations) to the initial RDD.
        // This is usually done by creating a function and passing it to an RDD operator such as map or flatMap.
        // A simple function is usually written as an anonymous inner class of the required Function type;
        // a more complex function is better written as a separate class implementing that function interface.

        // First, split each line into individual words.
        // FlatMapFunction has two type parameters: the input type and the output type.
        // Here the input is String (a line of text) and the output is also String (the words in that line).
        // In short, the flatMap operator splits one element of the RDD into one or more elements.
        JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Iterable<String> call(String line) throws Exception {
                return Arrays.asList(line.split(" "));
            }

        });

        // Next, map each word to a pair of the form (word, 1).
        // Only in this format can the word later be used as the key to accumulate its occurrence count.
        // mapToPair maps each element to a Tuple2 element of the form (v1, v2).
        // If you remember tuples from Scala: yes, Tuple2 here is the Scala type that holds two values.
        // The mapToPair operator is used together with PairFunction: the first type parameter is the
        // input type, and the second and third are the types of the first and second values of the output Tuple2.
        // The two type parameters of JavaPairRDD are likewise the types of the tuple's first and second values.
        JavaPairRDD<String, Integer> pairs = words.mapToPair(

                new PairFunction<String, String, Integer>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public Tuple2<String, Integer> call(String word) throws Exception {
                        return new Tuple2<String, Integer>(word, 1);
                    }

                });

        // Next, count how many times each word occurs, using the word as the key.
        // The reduceByKey operator applies a reduce operation to all values that share the same key.
        // For example, if the JavaPairRDD contains the elements (hello, 1) (hello, 1) (hello, 1) (world, 1),
        // the reduce operation combines the first and second values, then combines that result with the
        // third value; for hello this means 1 + 1 = 2, and then 2 + 1 = 3.
        // The elements of the resulting JavaPairRDD are again tuples, whose first value is each key and
        // whose second value is the reduced value for that key, i.e. the number of times the word occurred.
        JavaPairRDD<String, Integer> wordCounts = pairs.reduceByKey(

                new Function2<Integer, Integer, Integer>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public Integer call(Integer v1, Integer v2) throws Exception {
                        return v1 + v2;
                    }

                });

        // At this point the word counts have been computed with a few Spark operators.
        // However, flatMap, mapToPair and reduceByKey are all transformations, and a Spark application
        // that contains only transformations never actually executes anything; an action is required.
        // So finally we use an action, such as foreach, to trigger execution and print the results.
        wordCounts.foreach(new VoidFunction<Tuple2<String, Integer>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<String, Integer> wordCount) throws Exception {
                System.out.println(wordCount._1 + " appeared " + wordCount._2 + " times.");
            }

        });

        sc.close();
    }
}

 

 

 

To run the program on a Spark cluster instead, only two changes are needed.

First, remove the setMaster() call from SparkConf; the master is then taken from the environment the application is submitted to (for example via spark-submit).

Second, read a real large dataset stored on HDFS instead of a local file, for example:

JavaRDD<String> lines = sc.textFile("hdfs://spark1:9000/spark.txt");

Spark demo 2: summing the lengths of all lines

 

 

package com.tg.spark;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.storage.StorageLevel;
/**
 * Create an RDD from a dataset on an external file system (HDFS)
 * and pass functions to Spark as anonymous inner classes.
 */
public class RDDOps {
    // Sum the lengths of all lines in the file
    public static void main(String[] args) {

        SparkConf conf = new SparkConf();
        conf.set("spark.testing.memory", "2147480000");     // work around the JVM not getting enough memory
        JavaSparkContext sc = new JavaSparkContext("local", "First Spark App", conf);
        System.out.println(sc);

        // Define an RDD from a file on HDFS. Nothing is loaded into memory and no work is done yet;
        // lines merely points at the file.
        JavaRDD<String> lines = sc.textFile("hdfs://master:9000/testFile/README.md");

        // Define lineLengths as the result of a map transformation. Because of laziness,
        // lineLengths is not computed immediately.
        // The first type parameter is the input type, the second is the return type of the function.
        JavaRDD<Integer> lineLengths = lines.map(new Function<String, Integer>() {
            public Integer call(String s) {
                System.out.println("line length: " + s.length());
                return s.length();
            }
        });

        // Run reduce, which is an action. Only now does Spark break the computation into tasks
        // that run on separate machines; each machine runs its part of the map and a local reduction,
        // and returns its result to the driver program.
        int totalLength = lineLengths.reduce(new Function2<Integer, Integer, Integer>() {
            public Integer call(Integer a, Integer b) { return a + b; }
        });

        System.out.println(totalLength);

        // Mark lineLengths to be persisted in memory for later reuse
        // (this takes effect the next time an action is run on it).
        lineLengths.persist(StorageLevel.MEMORY_ONLY());
    }
}

 

 

 

Submit the job with the spark-submit command; run spark-submit --help to see the full usage.

Example:

spark-submit --master spark://eb174:7077 --name WordCountByscala --class com.hq.WordCount --executor-memory 1G --total-executor-cores 2 ~/test/WordCount.jar hdfs://eb170:8020/user/ebupt/text

Sample output:

 

 

 

 

        1 Spark assembly has been built with Hive, including Datanucleus jars on classpath
        2 Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
        3 19:24:51 INFO SecurityManager: Changing view acls to: ebupt,
        4 19:24:51 INFO SecurityManager: Changing modify acls to: ebupt,
        5 19:24:51 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(ebupt, ); users with modify permissions: Set(ebupt, )
        6 19:24:52 INFO Slf4jLogger: Slf4jLogger started
        7 19:24:52 INFO Remoting: Starting remoting
        8 19:24:52 INFO Remoting: Remoting started; listening on addresses :[akka.tcp://sparkDriver@eb174:56344]
        9 19:24:52 INFO Remoting: Remoting now listens on addresses: [akka.tcp://sparkDriver@eb174:56344]
        10 19:24:52 INFO Utils: Successfully started service 'sparkDriver' on port 56344.
        11 19:24:52 INFO SparkEnv: Registering MapOutputTracker
        12 19:24:52 INFO SparkEnv: Registering BlockManagerMaster
        13 19:24:52 INFO DiskBlockManager: Created local directory at /tmp/spark-local-20141010192452-3398
        14 19:24:52 INFO Utils: Successfully started service 'Connection manager for block manager' on port 41110.
        15 19:24:52 INFO ConnectionManager: Bound socket to port 41110 with id = ConnectionManagerId(eb174,41110)
        16 19:24:52 INFO MemoryStore: MemoryStore started with capacity 265.4 MB
        17 19:24:52 INFO BlockManagerMaster: Trying to register BlockManager
        18 19:24:52 INFO BlockManagerMasterActor: Registering block manager eb174:41110 with 265.4 MB RAM
        19 19:24:52 INFO BlockManagerMaster: Registered BlockManager
        20 19:24:52 INFO HttpFileServer: HTTP File server directory is /tmp/spark-8051667e-bfdb-4ecd-8111-52992b16bb13
        21 19:24:52 INFO HttpServer: Starting HTTP Server
        22 19:24:52 INFO Utils: Successfully started service 'HTTP file server' on port 48233.
        23 19:24:53 INFO Utils: Successfully started service 'SparkUI' on port 4040.
        24 19:24:53 INFO SparkUI: Started SparkUI at http://eb174:4040
        25 19:24:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
        26 19:24:53 INFO SparkContext: Added JAR file:/home/ebupt/test/WordCountByscala.jar at http://10.1.69.174:48233/jars/WordCountByscala.jar with timestamp 1412940293532
        27 19:24:53 INFO AppClient$ClientActor: Connecting to master spark://eb174:7077...
        28 19:24:53 INFO SparkDeploySchedulerBackend: SchedulerBackend is ready for scheduling beginning after reached minRegisteredResourcesRatio: 0.0
        29 19:24:53 INFO MemoryStore: ensureFreeSpace(163705) called with curMem=0, maxMem=278302556
        30 19:24:53 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 159.9 KB, free 265.3 MB)
        31 19:24:53 INFO SparkDeploySchedulerBackend: Connected to Spark cluster with app ID app-20141010192453-0009
        32 19:24:53 INFO AppClient$ClientActor: Executor added: app-20141010192453-0009/0 on worker-20141008204132-eb176-49618 (eb176:49618) with 1 cores
        33 19:24:53 INFO SparkDeploySchedulerBackend: Granted executor ID app-20141010192453-0009/0 on hostPort eb176:49618 with 1 cores, 1024.0 MB RAM
        34 19:24:53 INFO AppClient$ClientActor: Executor added: app-20141010192453-0009/1 on worker-20141008204132-eb175-56337 (eb175:56337) with 1 cores
        35 19:24:53 INFO SparkDeploySchedulerBackend: Granted executor ID app-20141010192453-0009/1 on hostPort eb175:56337 with 1 cores, 1024.0 MB RAM
        36 19:24:53 INFO AppClient$ClientActor: Executor updated: app-20141010192453-0009/0 is now RUNNING
        37 19:24:53 INFO AppClient$ClientActor: Executor updated: app-20141010192453-0009/1 is now RUNNING
        38 19:24:53 INFO MemoryStore: ensureFreeSpace(12633) called with curMem=163705, maxMem=278302556
        39 19:24:53 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 12.3 KB, free 265.2 MB)
        40 19:24:53 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on eb174:41110 (size: 12.3 KB, free: 265.4 MB)
        41 19:24:53 INFO BlockManagerMaster: Updated info of block broadcast_0_piece0
        42 19:24:54 INFO FileInputFormat: Total input paths to process : 1
        43 19:24:54 INFO SparkContext: Starting job: collect at WordCount.scala:26
        44 19:24:54 INFO DAGScheduler: Registering RDD 3 (map at WordCount.scala:26)
        45 19:24:54 INFO DAGScheduler: Got job 0 (collect at WordCount.scala:26) with 2 output partitions (allowLocal=false)
        46 19:24:54 INFO DAGScheduler: Final stage: Stage 0(collect at WordCount.scala:26)
        47 19:24:54 INFO DAGScheduler: Parents of final stage: List(Stage 1)
        48 19:24:54 INFO DAGScheduler: Missing parents: List(Stage 1)
        49 19:24:54 INFO DAGScheduler: Submitting Stage 1 (MappedRDD[3] at map at WordCount.scala:26), which has no missing parents
        50 19:24:54 INFO MemoryStore: ensureFreeSpace(3400) called with curMem=176338, maxMem=278302556
        51 19:24:54 INFO MemoryStore: Block broadcast_1 stored as values in memory (estimated size 3.3 KB, free 265.2 MB)
        52 19:24:54 INFO MemoryStore: ensureFreeSpace(2082) called with curMem=179738, maxMem=278302556
        53 19:24:54 INFO MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 2.0 KB, free 265.2 MB)
        54 19:24:54 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on eb174:41110 (size: 2.0 KB, free: 265.4 MB)
        55 19:24:54 INFO BlockManagerMaster: Updated info of block broadcast_1_piece0
        56 19:24:54 INFO DAGScheduler: Submitting 2 missing tasks from Stage 1 (MappedRDD[3] at map at WordCount.scala:26)
        57 19:24:54 INFO TaskSchedulerImpl: Adding task set 1.0 with 2 tasks
        58 19:24:56 INFO SparkDeploySchedulerBackend: Registered executor: Actor[akka.tcp://sparkExecutor@eb176:35482/user/Executor#1456950111] with ID 0
        59 19:24:56 INFO TaskSetManager: Starting task 0.0 in stage 1.0 (TID 0, eb176, ANY, 1238 bytes)
        60 19:24:56 INFO SparkDeploySchedulerBackend: Registered executor: Actor[akka.tcp://sparkExecutor@eb175:35502/user/Executor#-1231100997] with ID 1
        61 19:24:56 INFO TaskSetManager: Starting task 1.0 in stage 1.0 (TID 1, eb175, ANY, 1238 bytes)
        62 19:24:56 INFO BlockManagerMasterActor: Registering block manager eb176:33296 with 530.3 MB RAM
        63 19:24:56 INFO BlockManagerMasterActor: Registering block manager eb175:32903 with 530.3 MB RAM
        64 19:24:57 INFO ConnectionManager: Accepted connection from [eb176/10.1.69.176:39218]
        65 19:24:57 INFO ConnectionManager: Accepted connection from [eb175/10.1.69.175:55227]
        66 19:24:57 INFO SendingConnection: Initiating connection to [eb176/10.1.69.176:33296]
        67 19:24:57 INFO SendingConnection: Initiating connection to [eb175/10.1.69.175:32903]
        68 19:24:57 INFO SendingConnection: Connected to [eb175/10.1.69.175:32903], 1 messages pending
        69 19:24:57 INFO SendingConnection: Connected to [eb176/10.1.69.176:33296], 1 messages pending
        70 19:24:57 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on eb175:32903 (size: 2.0 KB, free: 530.3 MB)
        71 19:24:57 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on eb176:33296 (size: 2.0 KB, free: 530.3 MB)
        72 19:24:57 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on eb176:33296 (size: 12.3 KB, free: 530.3 MB)
        73 19:24:57 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on eb175:32903 (size: 12.3 KB, free: 530.3 MB)
        74 19:24:58 INFO TaskSetManager: Finished task 1.0 in stage 1.0 (TID 1) in 1697 ms on eb175 (1/2)
        75 19:24:58 INFO TaskSetManager: Finished task 0.0 in stage 1.0 (TID 0) in 1715 ms on eb176 (2/2)
        76 19:24:58 INFO TaskSchedulerImpl: Removed TaskSet 1.0, whose tasks have all completed, from pool
        77 19:24:58 INFO DAGScheduler: Stage 1 (map at WordCount.scala:26) finished in 3.593 s
        78 19:24:58 INFO DAGScheduler: looking for newly runnable stages
        79 19:24:58 INFO DAGScheduler: running: Set()
        80 19:24:58 INFO DAGScheduler: waiting: Set(Stage 0)
        81 19:24:58 INFO DAGScheduler: failed: Set()
        82 19:24:58 INFO DAGScheduler: Missing parents for Stage 0: List()
        83 19:24:58 INFO DAGScheduler: Submitting Stage 0 (ShuffledRDD[4] at reduceByKey at WordCount.scala:26), which is now runnable
        84 19:24:58 INFO MemoryStore: ensureFreeSpace(2096) called with curMem=181820, maxMem=278302556
        85 19:24:58 INFO MemoryStore: Block broadcast_2 stored as values in memory (estimated size 2.0 KB, free 265.2 MB)
        86 19:24:58 INFO MemoryStore: ensureFreeSpace(1338) called with curMem=183916, maxMem=278302556
        87 19:24:58 INFO MemoryStore: Block broadcast_2_piece0 stored as bytes in memory (estimated size 1338.0 B, free 265.2 MB)
        88 19:24:58 INFO BlockManagerInfo: Added broadcast_2_piece0 in memory on eb174:41110 (size: 1338.0 B, free: 265.4 MB)
        89 19:24:58 INFO BlockManagerMaster: Updated info of block broadcast_2_piece0
        90 19:24:58 INFO DAGScheduler: Submitting 2 missing tasks from Stage 0 (ShuffledRDD[4] at reduceByKey at WordCount.scala:26)
        91 19:24:58 INFO TaskSchedulerImpl: Adding task set 0.0 with 2 tasks
        92 19:24:58 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 2, eb175, PROCESS_LOCAL, 1008 bytes)
        93 19:24:58 INFO TaskSetManager: Starting task 1.0 in stage 0.0 (TID 3, eb176, PROCESS_LOCAL, 1008 bytes)
        94 19:24:58 INFO BlockManagerInfo: Added broadcast_2_piece0 in memory on eb175:32903 (size: 1338.0 B, free: 530.3 MB)
        95 19:24:58 INFO BlockManagerInfo: Added broadcast_2_piece0 in memory on eb176:33296 (size: 1338.0 B, free: 530.3 MB)
        96 19:24:58 INFO MapOutputTrackerMasterActor: Asked to send map output locations for shuffle 0 to sparkExecutor@eb175:59119
        97 19:24:58 INFO MapOutputTrackerMaster: Size of output statuses for shuffle 0 is 144 bytes
        98 19:24:58 INFO MapOutputTrackerMasterActor: Asked to send map output locations for shuffle 0 to sparkExecutor@eb176:39028
        99 19:24:58 INFO TaskSetManager: Finished task 1.0 in stage 0.0 (TID 3) in 109 ms on eb176 (1/2)
        100 19:24:58 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 2) in 120 ms on eb175 (2/2)
        101 19:24:58 INFO DAGScheduler: Stage 0 (collect at WordCount.scala:26) finished in 0.123 s
        102 19:24:58 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
        103 19:24:58 INFO SparkContext: Job finished: collect at WordCount.scala:26, took 3.815637915 s
        104 (scala,1)
        105 (Function2,1)
        106 (JavaSparkContext,1)
        107 (JavaRDD,1)
        108 (Tuple2,1)
        109 (,1)
        110 (org,7)
        111 (apache,7)
        112 (JavaPairRDD,1)
        113 (java,7)
        114 (function,4)
        115 (api,7)
        116 (Function,1)
        117 (PairFunction,1)
        118 (spark,7)
        119 (FlatMapFunction,1)
        120 (import,8)
        121 19:24:58 INFO SparkUI: Stopped Spark web UI at http://eb174:4040
        122 19:24:58 INFO DAGScheduler: Stopping DAGScheduler
        123 19:24:58 INFO SparkDeploySchedulerBackend: Shutting down all executors
        124 19:24:58 INFO SparkDeploySchedulerBackend: Asking each executor to shut down
        125 19:24:58 INFO ConnectionManager: Removing SendingConnection to ConnectionManagerId(eb176,33296)
        126 19:24:58 INFO ConnectionManager: Removing ReceivingConnection to ConnectionManagerId(eb176,33296)
        127 19:24:58 ERROR ConnectionManager: Corresponding SendingConnection to ConnectionManagerId(eb176,33296) not found
        128 19:24:58 INFO ConnectionManager: Removing ReceivingConnection to ConnectionManagerId(eb175,32903)
        129 19:24:58 INFO ConnectionManager: Removing SendingConnection to ConnectionManagerId(eb175,32903)
        130 19:24:58 INFO ConnectionManager: Removing SendingConnection to ConnectionManagerId(eb175,32903)
        131 19:24:58 INFO ConnectionManager: Key not valid ? sun.nio.ch.SelectionKeyImpl@5e92c11b
        132 19:24:58 INFO ConnectionManager: key already cancelled ? sun.nio.ch.SelectionKeyImpl@5e92c11b
        133 java.nio.channels.CancelledKeyException
        134 at org.apache.spark.network.ConnectionManager.run(ConnectionManager.scala:310)
        135 at org.apache.spark.network.ConnectionManager$$anon$4.run(ConnectionManager.scala:139)
        136 14/10/10 19:24:59 INFO MapOutputTrackerMasterActor: MapOutputTrackerActor stopped!
        137 14/10/10 19:24:59 INFO ConnectionManager: Selector thread was interrupted!
        138 14/10/10 19:24:59 INFO ConnectionManager: Removing ReceivingConnection to ConnectionManagerId(eb176,33296)
        139 19:24:59 ERROR ConnectionManager: Corresponding SendingConnection to ConnectionManagerId(eb176,33296) not found
        140 19:24:59 INFO ConnectionManager: Removing SendingConnection to ConnectionManagerId(eb176,33296)
        141 19:24:59 WARN ConnectionManager: All connections not cleaned up
        142 19:24:59 INFO ConnectionManager: ConnectionManager stopped
        143 19:24:59 INFO MemoryStore: MemoryStore cleared
        144 19:24:59 INFO BlockManager: BlockManager stopped
        145 19:24:59 INFO BlockManagerMaster: BlockManagerMaster stopped
        146 19:24:59 INFO SparkContext: Successfully stopped SparkContext
        147 19:24:59 INFO RemoteActorRefProvider$RemotingTerminator: Shutting down remote daemon.
        148 19:24:59 INFO RemoteActorRefProvider$RemotingTerminator: Remote daemon shut down; proceeding with flushing remote transports.
        149 19:24:59 INFO Remoting: Remoting shut down
        150 19:24:59 INFO RemoteActorRefProvider$RemotingTerminator: Remoting shut down.

 

 

 

 

 

 

Other: creating an RDD with SparkContext's parallelize method

An RDD can also be created by calling SparkContext's parallelize method on an existing collection (a Seq in Scala, a java.util.List in Java). The elements of the collection are copied to form a distributed dataset that can be operated on in parallel. Once the distributed dataset (distData) has been created, it can be operated on in parallel, as in the sketch below.
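
A minimal sketch of parallelize with the Java API used in this article; the class name and the numbers are made up for illustration:

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

public class ParallelizeDemo {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("ParallelizeDemo").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Copy an in-memory collection into a distributed dataset
        List<Integer> data = Arrays.asList(1, 2, 3, 4, 5);
        JavaRDD<Integer> distData = sc.parallelize(data);

        // The distributed dataset can now be operated on in parallel, e.g. summing its elements
        int sum = distData.reduce(new Function2<Integer, Integer, Integer>() {
            public Integer call(Integer a, Integer b) {
                return a + b;
            }
        });
        System.out.println("sum = " + sum);   // prints: sum = 15

        sc.close();
    }
}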

RDD-related blog post:
http://blog.csdn.net/qq_28945021/article/details/51601163


Other references: http://www.cnblogs.com/byrhuangqiang/p/4017725.html

http://blog.csdn.net/xiefu5hh/article/details/51781074

Transformations (transformation operators)

map: passes elements in one at a time and returns one element for each, producing a new RDD.

filter(func): applies func to every element of the RDD it is called on and returns a new RDD containing only the elements for which func returns true.

flatMap: passes elements in one at a time and returns a collection for each, producing a new RDD from the flattened results.

mapPartitions: processes the RDD one whole partition at a time (the function receives an iterator over a partition) and returns a new RDD.

Key-value (pair RDD) transformations:

keyBy: passes each element in and returns a (key, value) pair RDD in which the value is the input element and the key is the function's return value.

mapValues: for a (K, V) RDD, the function receives each value and returns a modified value, producing an RDD with the same keys and the modified values.

flatMapValues: like mapValues, but the function returns a collection for each value, expanding the RDD; every resulting element keeps the key of the value it came from. A combined sketch of filter, mapPartitions and these key-value operators follows this list.
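
A combined minimal sketch of filter, mapPartitions, keyBy, mapValues and flatMapValues, using the Spark 1.x Java API from the programs above (with this API, mapPartitions and flatMapValues return an Iterable per input); the class name and the sample data are made up for illustration:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;

public class TransformationDemo {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("TransformationDemo").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> words = sc.parallelize(Arrays.asList("spark", "hive", "hadoop", "pig"));

        // filter: keep only the elements for which the function returns true
        JavaRDD<String> longWords = words.filter(new Function<String, Boolean>() {
            public Boolean call(String s) { return s.length() > 4; }
        });

        // mapPartitions: the function sees a whole partition (an iterator) at a time
        JavaRDD<Integer> lengths = words.mapPartitions(new FlatMapFunction<Iterator<String>, Integer>() {
            public Iterable<Integer> call(Iterator<String> partition) {
                List<Integer> out = new ArrayList<Integer>();
                while (partition.hasNext()) {
                    out.add(partition.next().length());
                }
                return out;
            }
        });

        // keyBy: the function's return value becomes the key, the input element becomes the value
        JavaPairRDD<Integer, String> byLength = words.keyBy(new Function<String, Integer>() {
            public Integer call(String s) { return s.length(); }
        });

        // mapValues: transform only the values, the keys stay the same
        JavaPairRDD<Integer, String> upper = byLength.mapValues(new Function<String, String>() {
            public String call(String w) { return w.toUpperCase(); }
        });

        // flatMapValues: one value in, a collection of values out, each keeping the original key
        JavaPairRDD<Integer, String> expanded = byLength.flatMapValues(new Function<String, Iterable<String>>() {
            public Iterable<String> call(String w) { return Arrays.asList(w, w + "!"); }
        });

        System.out.println(longWords.collect());   // [spark, hadoop]
        System.out.println(lengths.collect());     // [5, 4, 6, 3]
        System.out.println(upper.collect());       // [(5,SPARK), (4,HIVE), (6,HADOOP), (3,PIG)]
        System.out.println(expanded.collect());    // e.g. [(5,spark), (5,spark!), (4,hive), (4,hive!), ...]

        sc.close();
    }
}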


reduce merges all the elements of an RDD. When it runs, the call method is invoked with two elements and returns their merged result; that result is then passed back into call together with the next element, and the merging continues until only a single value is left (as in the lineLengths.reduce(...) call in the second demo above).

By analogy with mapValues, reduceByKey merges only those values of the (K, V) pairs in the RDD whose keys are equal (as in the pairs.reduceByKey(...) call in the WordCount program above).


union, join and groupByKey
To combine two RDDs you use union or join. union simply concatenates the two RDDs and can be thought of as List's addAll method; and just as with List, when using union or join the two RDDs must have the same element (generic) types.

groupByKey: groups the elements by key; each value of the resulting JavaPairRDD<String, Iterable<Integer>> is the collection of all values that had that key.

join merges two pair RDDs, grouping together the elements that share the same key; it can loosely be understood as a combination of groupByKey and union. A short sketch follows.
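
A minimal sketch using the Spark 1.x Java API; the class name and the score data are invented for illustration:

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class CombineDemo {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("CombineDemo").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaPairRDD<String, Integer> scores1 = sc.parallelizePairs(Arrays.asList(
                new Tuple2<String, Integer>("tom", 80),
                new Tuple2<String, Integer>("jack", 90)));
        JavaPairRDD<String, Integer> scores2 = sc.parallelizePairs(Arrays.asList(
                new Tuple2<String, Integer>("tom", 85),
                new Tuple2<String, Integer>("lucy", 70)));

        // union: simply concatenates the two RDDs (like List.addAll), keeping duplicates
        JavaPairRDD<String, Integer> all = scores1.union(scores2);

        // groupByKey: collects all values that share a key into one Iterable
        JavaPairRDD<String, Iterable<Integer>> grouped = all.groupByKey();

        // join: pairs up the values of keys present in both RDDs -> (key, (value1, value2))
        JavaPairRDD<String, Tuple2<Integer, Integer>> joined = scores1.join(scores2);

        System.out.println(all.collect());      // [(tom,80), (jack,90), (tom,85), (lucy,70)]
        System.out.println(grouped.collect());  // tom -> [80, 85], jack -> [90], lucy -> [70]
        System.out.println(joined.collect());   // [(tom,(80,85))] - only tom appears in both RDDs

        sc.close();
    }
}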

sample and cartesian

sample is used for sampling and takes three parameters:

withReplacement: whether the sample is drawn with replacement (true means with replacement).

fraction: the expected sampling fraction.

seed: the seed for the random number generator.

cartesian: computes the Cartesian product of two RDDs. A short sketch follows.
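
A minimal sketch; the class name and the numbers are invented, and because sampling is probabilistic the sample size is only approximately fraction * count:

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class SampleCartesianDemo {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("SampleCartesianDemo").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10));
        JavaRDD<String> letters = sc.parallelize(Arrays.asList("a", "b"));

        // sample(withReplacement, fraction, seed): roughly 30% of the elements, without replacement
        JavaRDD<Integer> sampled = numbers.sample(false, 0.3, 42L);

        // cartesian: every pairing of an element from the first RDD with one from the second
        JavaPairRDD<Integer, String> product = numbers.cartesian(letters);

        System.out.println(sampled.collect());   // a random subset, e.g. [2, 5, 9]
        System.out.println(product.count());     // 10 * 2 = 20 pairs
        System.out.println(product.take(3));     // e.g. [(1,a), (1,b), (2,a)]

        sc.close();
    }
}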


Action operators:

collect(): returns the elements of the dataset to the driver as an array; typically used after a filter or on a result that is small enough to fit in driver memory.

count(): returns the number of elements in the dataset.

first(): returns the first element of the dataset.

take(n): returns the first n elements.

takeSample(withReplacement, num, seed): returns a random sample of num elements of the dataset, using the given random seed.

saveAsTextFile(path): writes the dataset as a text file to a local path, to HDFS, or to any other HDFS-supported file system; Spark converts each record to a line of text and writes it to the file.

saveAsSequenceFile(path): can only be used on key-value pair RDDs; writes the dataset as a SequenceFile to the local file system or to a Hadoop file system.

countByKey(): operates on a pair RDD and returns a map from each key to the number of elements with that key.

foreach(func): applies func to every element of the dataset. A short sketch of several of these actions follows.
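
A minimal sketch of several actions on word-count-style pairs like those built earlier; the class name, the data and the output path are invented, and note that saveAsTextFile fails if the output directory already exists:

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class ActionDemo {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("ActionDemo").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaPairRDD<String, Integer> pairs = sc.parallelizePairs(Arrays.asList(
                new Tuple2<String, Integer>("hello", 1),
                new Tuple2<String, Integer>("hello", 1),
                new Tuple2<String, Integer>("world", 1)));

        System.out.println(pairs.collect());                  // all elements as a list on the driver
        System.out.println(pairs.count());                    // 3
        System.out.println(pairs.first());                    // (hello,1)
        System.out.println(pairs.take(2));                    // [(hello,1), (hello,1)]
        System.out.println(pairs.takeSample(false, 2, 42L));  // 2 random elements, without replacement
        System.out.println(pairs.countByKey());               // hello -> 2, world -> 1 (map order may vary)

        // Writes one text file per partition under the given output directory
        pairs.saveAsTextFile("hdfs://master:9000/output/actionDemo");

        sc.close();
    }
}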

 

 

 
