20191202 Spark

login as: root
     ┌────────────────────────────────────────────────────────────────────┐
     │                        • MobaXterm 11.0 •                          │
     │            (SSH client, X-server and networking tools)             │
     │                                                                    │
     │ ➤ SSH session to root@192.168.89.105                               │
     │   • SSH compression : ✔                                            │
     │   • SSH-browser     : ✔                                            │
     │   • X11-forwarding  :(disabled or not supported by server)     │
     │   • DISPLAY         : 172.16.9.44:0.0                              │
     │                                                                    │
     │ ➤ For more info, ctrl+click on help or visit our website           │
     └────────────────────────────────────────────────────────────────────┘

Last login: Sun Dec  1 09:42:50 2019 from 192.168.89.1
-bash: /root: Is a directory
[root@SparkOnStandalone ~]# jps
1346 Jps
[root@SparkOnStandalone ~]# start-dfs.sh
Starting namenodes on [SparkOnStandalone]
SparkOnStandalone: /root/.bashrc: line 13: /root: Is a directory
SparkOnStandalone: starting namenode, logging to /usr/hadoop-2.9.2/logs/hadoop-root-namenode-SparkOnStandalone.out
SparkOnStandalone: /root/.bashrc: line 13: /root: Is a directory
SparkOnStandalone: starting datanode, logging to /usr/hadoop-2.9.2/logs/hadoop-root-datanode-SparkOnStandalone.out
Starting secondary namenodes [SparkOnStandalone]
SparkOnStandalone: /root/.bashrc: line 13: /root: Is a directory
SparkOnStandalone: starting secondarynamenode, logging to /usr/hadoop-2.9.2/logs/hadoop-root-secondarynamenode-SparkOnStandalone.out
[root@SparkOnStandalone ~]# jps
2576 DataNode
2833 SecondaryNameNode
3126 Jps
2442 NameNode
[root@SparkOnStandalone ~]# cd /usr/spark-2.4.4/
[root@SparkOnStandalone spark-2.4.4]# ls
bin  conf  data  examples  jars  kubernetes  LICENSE  licenses  logs  NOTICE  python  R  README.md  RELEASE  sbin  work  yarn
[root@SparkOnStandalone spark-2.4.4]# hdfs dfs -put README.md /
Microsoft Windows [Version 10.0.18362.476]
(c) 2019 Microsoft Corporation. All rights reserved.

C:\Users\lenovo>mysql -uroot -proot
Welcome to the MySQL monitor.  Commands end with ; or \g.
Your MySQL connection id is 1
Server version: 5.5.28 MySQL Community Server (GPL)

Copyright (c) 2000, 2012, Oracle and/or its affiliates. All rights reserved.

Oracle is a registered trademark of Oracle Corporation and/or its
affiliates. Other names may be trademarks of their respective
owners.

Type 'help;' or '\h' for help. Type '\c' to clear the current input statement.
mysql> show databases;
+--------------------+
| Database           |
+--------------------+
| information_schema |
| cmfz               |
| file               |
| hadoop             |
| jqgrid             |
| kg                 |
| lombok             |
| mysql              |
| no                 |
| novel              |
| performance_schema |
| springbootmybatis  |
| springsmblue       |
| ssm                |
| ssmgray            |
| test               |
| tms                |
| vote               |
+--------------------+
18 rows in set (0.45 sec)
mysql> use hadoop;
Database changed
mysql> show tables;
+------------------+
| Tables_in_hadoop |
+------------------+
| user             |
+------------------+
1 row in set (0.03 sec)
mysql> select * from user;
+----+------+
| id | name |
+----+------+
|  1 | maby |
+----+------+
1 row in set (0.24 sec)
mysql> select * from user
    ->
    -> ;
+----+------+------+
| id | name | sex  |
+----+------+------+
|  1 | maby | NULL |
+----+------+------+
1 row in set (0.00 sec)
mysql> desc user;
+-------+--------------+------+-----+---------+-------+
| Field | Type         | Null | Key | Default | Extra |
+-------+--------------+------+-----+---------+-------+
| id    | int(100)     | NO   | PRI | NULL    |       |
| name  | varchar(100) | NO   |     | NULL    |       |
| sex   | varchar(100) | YES  |     | NULL    |       |
+-------+--------------+------+-----+---------+-------+
3 rows in set (0.03 sec)
mysql> select * from user
    -> ;
+----+--------+-------+
| id | name   | sex   |
+----+--------+-------+
|  1 | maby   | na    |
|  2 | maba   | jdq   |
|  3 | nfakl  | jflka |
|  4 | fjdlaf | dfjql |
+----+--------+-------+
4 rows in set (0.00 sec)
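The `hadoop.user` table inspected above is the kind of table these notes later read from Spark. As a minimal sketch only (the JDBC URL, driver class, and credentials below are assumptions, not values taken from this transcript), the table can be loaded into a DataFrame through Spark SQL's JDBC source, assuming a MySQL JDBC driver is on the classpath:

```scala
import org.apache.spark.sql.SparkSession

object ReadUserFromMySQL {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("jdbc read").getOrCreate()

    // Read the hadoop.user table shown above; URL/driver/credentials are illustrative assumptions.
    val users = spark.read
      .format("jdbc")
      .option("url", "jdbc:mysql://localhost:3306/hadoop")
      .option("driver", "com.mysql.jdbc.Driver")
      .option("dbtable", "user")
      .option("user", "root")
      .option("password", "root")
      .load()

    users.show() // prints id, name, sex rows as a DataFrame
    spark.stop()
  }
}
```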
[root@SparkOnStandalone ~]# yum install -y nc
Loaded plugins: fastestmirror
Loading mirror speeds from cached hostfile
 * base: mirrors.aliyun.com
 * extras: ftp.sjtu.edu.cn
 * updates: mirrors.aliyun.com
Resolving Dependencies
--> Running transaction check
---> Package nmap-ncat.x86_64 2:6.40-19.el7 will be installed
--> Processing Dependency: libpcap.so.1()(64bit) for package: 2:nmap-ncat-6.40-19.el7.x86_64
--> Running transaction check
---> Package libpcap.x86_64 14:1.5.3-11.el7 will be installed
--> Finished Dependency Resolution

Dependencies Resolved

==============================================================================================================================================
 Package                           Arch                           Version                                  Repository                    Size
==============================================================================================================================================
Installing:
 nmap-ncat                         x86_64                         2:6.40-19.el7                            base                         206 k
Installing for dependencies:
 libpcap                           x86_64                         14:1.5.3-11.el7                          base                         138 k

Transaction Summary
==============================================================================================================================================
Install  1 Package (+1 Dependent package)

Total download size: 344 k
Installed size: 740 k
Downloading packages:
(1/2): libpcap-1.5.3-11.el7.x86_64.rpm                                                                                 | 138 kB  00:00:00
(2/2): nmap-ncat-6.40-19.el7.x86_64.rpm                                                                                | 206 kB  00:00:00
----------------------------------------------------------------------------------------------------------------------------------------------
Total                                                                                                         361 kB/s | 344 kB  00:00:00
Running transaction check
Running transaction test
Transaction test succeeded
Running transaction
Warning: RPMDB altered outside of yum.
  Installing : 14:libpcap-1.5.3-11.el7.x86_64                                                                                             1/2
  Installing : 2:nmap-ncat-6.40-19.el7.x86_64                                                                                             2/2
  Verifying  : 2:nmap-ncat-6.40-19.el7.x86_64                                                                                             1/2
  Verifying  : 14:libpcap-1.5.3-11.el7.x86_64                                                                                             2/2

Installed:
  nmap-ncat.x86_64 2:6.40-19.el7

Dependency Installed:
  libpcap.x86_64 14:1.5.3-11.el7

Complete!
[root@SparkOnStandalone ~]# jps
2576 DataNode
2833 SecondaryNameNode
43379 Worker
41861 ResourceManager
90580 Jps
41975 NodeManager
2442 NameNode
43260 Master
[root@SparkOnStandalone ~]# stop-yarn.sh
stopping yarn daemons
stopping resourcemanager
resourcemanager did not stop gracefully after 5 seconds: killing with kill -9
SparkOnStandalone: /root/.bashrc: line 13: /root: Is a directory
SparkOnStandalone: stopping nodemanager
SparkOnStandalone: nodemanager did not stop gracefully after 5 seconds: killing with kill -9
no proxyserver to stop
[root@SparkOnStandalone ~]# jps
2576 DataNode
2833 SecondaryNameNode
90867 Jps
43379 Worker
2442 NameNode
43260 Master
[root@SparkOnStandalone spark-2.4.4]# bin/spark-shell --master spark://SparkOnStandalone:7077 --total-executor-cores 2
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Spark context Web UI available at http://SparkOnStandalone:4040
Spark context available as 'sc' (master = spark://SparkOnStandalone:7077, app id = app-20191202000646-0001).
Spark session available as 'spark'.
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.4.4
      /_/

Using Scala version 2.11.12 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_171)
Type in expressions to have them evaluated.
Type :help for more information.

scala>
scala> sc.textFile("hdfs://SparkOnStandalone:9000/text.txt").flatMap(_.split(" ")).map((_,1)).groupByKey().map(t2 =>(t2._1,t2._2.size)).sortBy(t2=>t2._2,false,1).collect()
res0: Array[(String, Int)] = Array((hello,3), (Kafka,1), (hadoop,1), (Flume,1))
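The same word count can also be written with reduceByKey, which combines counts inside each partition before the shuffle instead of moving every (word, 1) pair the way groupByKey does. A minimal sketch to paste into the same spark-shell session, using the same HDFS path as above:

```scala
// Word count with reduceByKey: counts are pre-aggregated per partition before shuffling.
sc.textFile("hdfs://SparkOnStandalone:9000/text.txt")
  .flatMap(_.split(" "))
  .map((_, 1))
  .reduceByKey(_ + _)
  .sortBy(_._2, ascending = false, numPartitions = 1)
  .collect()
```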
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * Spark Streaming transformation operations
  *
  * parent DStream --> child DStream
  */
object TransformationsTest {

  // main
  def main(args: Array[String]): Unit = {

    // 1. Initialize the streaming context
    val conf = new SparkConf().setMaster("local[*]").setAppName("transformations test")

    val ssc = new StreamingContext(conf, Seconds(5))

    ssc.sparkContext.setLogLevel("ERROR")

    // 2. Build the source DStream

    val ds = ssc.socketTextStream("SparkOnStandalone", 9999);

    //3. Apply transformation operations
    //--------------------------------------------------------------------------
    //    ds
    //      .flatMap(_.split(" ")) // line --> word
    //      .map((_, 1)) // word ---> (word,1)
    //      .filter(t2 => t2._1.startsWith("H")) // keep only the words starting with H
    //      .repartition(5) // split the data into 5 partitions
    //      .print()
    //--------------------------------------------------------------------------


    //--------------------------------------------------------------------------
    // union: combine two DStreams into a new DStream
    //    val ds1 = ssc.socketTextStream("SparkOnStandalone", 9999);
    //    val ds2 = ssc.socketTextStream("SparkOnStandalone", 8888);
    //    ds1
    //      .union(ds2)
    //      .print()

    //--------------------------------------------------------------------------

    //--------------------------------------------------------------------------
    // count: return the number of elements in each micro-batch RDD
    //    val ds3 = ssc.socketTextStream("SparkOnStandalone", 9999);
    //    ds3
    //      .count()
    //      .print()
    //--------------------------------------------------------------------------

    //--------------------------------------------------------------------------
    // reduce: apply the function across the elements of each micro-batch
    //    val ds4 = ssc.socketTextStream("SparkOnStandalone", 9999);
    //
    //    ds4
    //      .flatMap(_.split(" "))
    //      .map(strNum => strNum.toInt)
    //      .reduce((v1, v2) => v1 + v2) // 1 2 3 4 5
    //      .print()
    //--------------------------------------------------------------------------

    //--------------------------------------------------------------------------
    // countByValue: called on the DStream, returns (k, count), the number of occurrences of each element k
    //    val ds6 = ssc.socketTextStream("SparkOnStandalone", 9999);
    //    ds6
    //      .flatMap(_.split(" "))   // Hello Hello Spark
    //      .countByValue(2)   // (Hello,2)
    //                                      // (Spark,1)
    //      .print()
    //--------------------------------------------------------------------------

    //--------------------------------------------------------------------------
    // reduceByKey: apply the reduce function per key
    //    val ds7 = ssc.socketTextStream("SparkOnStandalone", 9999);
    //    ds7
    //      .flatMap(_.split(" "))
    //      .map((_, 1))
    //      .reduceByKey(_ - _)
    //      .print()
    //--------------------------------------------------------------------------

    //--------------------------------------------------------------------------
    // join: join two DStreams [rarely useful]; stream + batch joins are the common case
    //    val ds8 = ssc.socketTextStream("SparkOnStandalone", 9999).map((_, 1));  // (a,1)
    //    val ds9 = ssc.socketTextStream("SparkOnStandalone", 8888).map((_, 1));  // (a,1)
    //    ds8
    //      .join(ds9)  // (a,(1,1))  inner join
    //      .print()

    //--------------------------------------------------------------------------

    //--------------------------------------------------------------------------
    // cogroup
    //    val ds10 = ssc.socketTextStream("SparkOnStandalone", 9999).map((_, 1)); // (a,1)
    //    val ds11 = ssc.socketTextStream("SparkOnStandalone", 8888).map((_, 1)); // (a,1)
    //    ds10
    //      .cogroup(ds11) // (a,([1],[1]))  the official docs are slightly off here
    //      .print()
    //--------------------------------------------------------------------------

    //--------------------------------------------------------------------------
    // transform: apply an RDD operation to every micro-batch RDD of the DStream and return a new DStream [very important]
    // Key point: it exposes each micro-batch as an RDD, so any RDD operation can be applied
    // Example: a system runs a lottery, but users on the system blacklist are not allowed to take part
    //    1. Flash-sale abuse: offending users are added to the system blacklist [username, IP]
    //    2. Lottery requests (normal users + blacklisted users)
    //       stream (lottery requests) + batch (blacklist)
    val ds12 = ssc.socketTextStream("SparkOnStandalone", 9999);


    val blacklist = List((1, "zs"), (2, "ww"), (3, "tq"))
    val blacklistRDD = ssc.sparkContext.makeRDD(blacklist)

    // stream record format: userid method url
    ds12
      .map(request => {
        val arr = request.split(" ")
        val userId: Int = arr(0).toInt
        val method: String = arr(1)
        val url: String = arr(2)
        (userId, (method, url))
      })
      .transform(streamRDD => { // stream --> streamRDD.leftOuterJoin(batchRDD)
        streamRDD.leftOuterJoin(blacklistRDD)
      })
      // Keep only whitelisted users.  Blacklisted user: (1,((GET,/xxx),Some(zs)))
      //                               Whitelisted user: (4,((DELETE,/xxx2),None))
      .filter(t2 => t2._2._2.isEmpty) // is the joined blacklist entry empty (None)?
      .map(t2 => (t2._1, t2._2._1._1, t2._2._1._2))
      .print()



    //--------------------------------------------------------------------------


    //4. Start the streaming application
    ssc.start()

    //5. Gracefully shut down the application
    ssc.awaitTermination()
  }
}
"C:\Program Files\Java\jdk1.8.0_121\bin\java.exe" -DHADOOP_USER_NAME=root "-javaagent:D:\IntelliJ IDEA 2018.2.5\lib\idea_rt.jar=4594:D:\IntelliJ IDEA 2018.2.5\bin" -Dfile.encoding=UTF-8 -classpath "C:\Program Files\Java\jdk1.8.0_121\jre\lib\charsets.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\deploy.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\ext\access-bridge-64.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\ext\cldrdata.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\ext\dnsns.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\ext\jaccess.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\ext\jfxrt.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\ext\localedata.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\ext\nashorn.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\ext\sunec.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\ext\sunjce_provider.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\ext\sunmscapi.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\ext\sunpkcs11.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\ext\zipfs.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\javaws.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\jce.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\jfr.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\jfxswt.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\jsse.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\management-agent.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\plugin.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\resources.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\rt.jar;D:\IntelliJ IDEA 2018.2.5\IEDAMBY\sparkstreaming-day2\target\classes;D:\.m2\org\apache\spark\spark-core_2.11\2.4.4\spark-core_2.11-2.4.4.jar;D:\.m2\com\thoughtworks\paranamer\paranamer\2.8\paranamer-2.8.jar;D:\.m2\org\apache\avro\avro\1.8.2\avro-1.8.2.jar;D:\.m2\org\codehaus\jackson\jackson-core-asl\1.9.13\jackson-core-asl-1.9.13.jar;D:\.m2\org\codehaus\jackson\jackson-mapper-asl\1.9.13\jackson-mapper-asl-1.9.13.jar;D:\.m2\org\apache\commons\commons-compress\1.8.1\commons-compress-1.8.1.jar;D:\.m2\org\tukaani\xz\1.5\xz-1.5.jar;D:\.m2\org\apache\avro\avro-mapred\1.8.2\avro-mapred-1.8.2-hadoop2.jar;D:\.m2\org\apache\avro\avro-ipc\1.8.2\avro-ipc-1.8.2.jar;D:\.m2\commons-codec\commons-codec\1.9\commons-codec-1.9.jar;D:\.m2\com\twitter\chill_2.11\0.9.3\chill_2.11-0.9.3.jar;D:\.m2\com\esotericsoftware\kryo-shaded\4.0.2\kryo-shaded-4.0.2.jar;D:\.m2\com\esotericsoftware\minlog\1.3.0\minlog-1.3.0.jar;D:\.m2\org\objenesis\objenesis\2.5.1\objenesis-2.5.1.jar;D:\.m2\com\twitter\chill-java\0.9.3\chill-java-0.9.3.jar;D:\.m2\org\apache\xbean\xbean-asm6-shaded\4.8\xbean-asm6-shaded-4.8.jar;D:\.m2\org\apache\hadoop\hadoop-client\2.6.5\hadoop-client-2.6.5.jar;D:\.m2\org\apache\hadoop\hadoop-common\2.6.5\hadoop-common-2.6.5.jar;D:\.m2\commons-cli\commons-cli\1.2\commons-cli-1.2.jar;D:\.m2\xmlenc\xmlenc\0.52\xmlenc-0.52.jar;D:\.m2\commons-httpclient\commons-httpclient\3.1\commons-httpclient-3.1.jar;D:\.m2\commons-io\commons-io\2.4\commons-io-2.4.jar;D:\.m2\commons-collections\commons-collections\3.2.2\commons-collections-3.2.2.jar;D:\.m2\commons-lang\commons-lang\2.6\commons-lang-2.6.jar;D:\.m2\commons-configuration\commons-configuration\1.6\commons-configuration-1.6.jar;D:\.m2\commons-digester\commons-digester\1.8\commons-digester-1.8.jar;D:\.m2\commons-beanutils\commons-beanutils\1.7.0\commons-beanutils-1.7.0.jar;D:\.m2\com\google\protobuf\protobuf-java\2.5.0\protobuf-java-2.5.0.jar;D:\.m2\com\google\code\gson\gson\2.2.4\gson-2.2.4.jar;D:\.m2\org\apache\hadoop\hadoop-auth\2.6.5\hadoop-auth-2.6.5.jar;D:\.m2\org\apache\
httpcomponents\httpclient\4.2.5\httpclient-4.2.5.jar;D:\.m2\org\apache\httpcomponents\httpcore\4.2.4\httpcore-4.2.4.jar;D:\.m2\org\apache\directory\server\apacheds-kerberos-codec\2.0.0-M15\apacheds-kerberos-codec-2.0.0-M15.jar;D:\.m2\org\apache\directory\server\apacheds-i18n\2.0.0-M15\apacheds-i18n-2.0.0-M15.jar;D:\.m2\org\apache\directory\api\api-asn1-api\1.0.0-M20\api-asn1-api-1.0.0-M20.jar;D:\.m2\org\apache\directory\api\api-util\1.0.0-M20\api-util-1.0.0-M20.jar;D:\.m2\org\apache\curator\curator-client\2.6.0\curator-client-2.6.0.jar;D:\.m2\org\htrace\htrace-core\3.0.4\htrace-core-3.0.4.jar;D:\.m2\org\apache\hadoop\hadoop-hdfs\2.6.5\hadoop-hdfs-2.6.5.jar;D:\.m2\org\mortbay\jetty\jetty-util\6.1.26\jetty-util-6.1.26.jar;D:\.m2\xerces\xercesImpl\2.9.1\xercesImpl-2.9.1.jar;D:\.m2\xml-apis\xml-apis\1.3.04\xml-apis-1.3.04.jar;D:\.m2\org\apache\hadoop\hadoop-mapreduce-client-app\2.6.5\hadoop-mapreduce-client-app-2.6.5.jar;D:\.m2\org\apache\hadoop\hadoop-mapreduce-client-common\2.6.5\hadoop-mapreduce-client-common-2.6.5.jar;D:\.m2\org\apache\hadoop\hadoop-yarn-client\2.6.5\hadoop-yarn-client-2.6.5.jar;D:\.m2\org\apache\hadoop\hadoop-yarn-server-common\2.6.5\hadoop-yarn-server-common-2.6.5.jar;D:\.m2\org\apache\hadoop\hadoop-mapreduce-client-shuffle\2.6.5\hadoop-mapreduce-client-shuffle-2.6.5.jar;D:\.m2\org\apache\hadoop\hadoop-yarn-api\2.6.5\hadoop-yarn-api-2.6.5.jar;D:\.m2\org\apache\hadoop\hadoop-mapreduce-client-core\2.6.5\hadoop-mapreduce-client-core-2.6.5.jar;D:\.m2\org\apache\hadoop\hadoop-yarn-common\2.6.5\hadoop-yarn-common-2.6.5.jar;D:\.m2\javax\xml\bind\jaxb-api\2.2.2\jaxb-api-2.2.2.jar;D:\.m2\javax\xml\stream\stax-api\1.0-2\stax-api-1.0-2.jar;D:\.m2\org\codehaus\jackson\jackson-jaxrs\1.9.13\jackson-jaxrs-1.9.13.jar;D:\.m2\org\codehaus\jackson\jackson-xc\1.9.13\jackson-xc-1.9.13.jar;D:\.m2\org\apache\hadoop\hadoop-mapreduce-client-jobclient\2.6.5\hadoop-mapreduce-client-jobclient-2.6.5.jar;D:\.m2\org\apache\hadoop\hadoop-annotations\2.6.5\hadoop-annotations-2.6.5.jar;D:\.m2\org\apache\spark\spark-launcher_2.11\2.4.4\spark-launcher_2.11-2.4.4.jar;D:\.m2\org\apache\spark\spark-kvstore_2.11\2.4.4\spark-kvstore_2.11-2.4.4.jar;D:\.m2\org\fusesource\leveldbjni\leveldbjni-all\1.8\leveldbjni-all-1.8.jar;D:\.m2\com\fasterxml\jackson\core\jackson-core\2.6.7\jackson-core-2.6.7.jar;D:\.m2\com\fasterxml\jackson\core\jackson-annotations\2.6.7\jackson-annotations-2.6.7.jar;D:\.m2\org\apache\spark\spark-network-common_2.11\2.4.4\spark-network-common_2.11-2.4.4.jar;D:\.m2\org\apache\spark\spark-network-shuffle_2.11\2.4.4\spark-network-shuffle_2.11-2.4.4.jar;D:\.m2\org\apache\spark\spark-unsafe_2.11\2.4.4\spark-unsafe_2.11-2.4.4.jar;D:\.m2\javax\activation\activation\1.1.1\activation-1.1.1.jar;D:\.m2\org\apache\curator\curator-recipes\2.6.0\curator-recipes-2.6.0.jar;D:\.m2\org\apache\curator\curator-framework\2.6.0\curator-framework-2.6.0.jar;D:\.m2\com\google\guava\guava\16.0.1\guava-16.0.1.jar;D:\.m2\org\apache\zookeeper\zookeeper\3.4.6\zookeeper-3.4.6.jar;D:\.m2\javax\servlet\javax.servlet-api\3.1.0\javax.servlet-api-3.1.0.jar;D:\.m2\org\apache\commons\commons-lang3\3.5\commons-lang3-3.5.jar;D:\.m2\org\apache\commons\commons-math3\3.4.1\commons-math3-3.4.1.jar;D:\.m2\com\google\code\findbugs\jsr305\1.3.9\jsr305-1.3.9.jar;D:\.m2\org\slf4j\slf4j-api\1.7.16\slf4j-api-1.7.16.jar;D:\.m2\org\slf4j\jul-to-slf4j\1.7.16\jul-to-slf4j-1.7.16.jar;D:\.m2\org\slf4j\jcl-over-slf4j\1.7.16\jcl-over-slf4j-1.7.16.jar;D:\.m2\log4j\log4j\1.2.17\log4j-1.2.17.jar;D:\.m2\org\slf4j\slf4j-log4j12\1.7.16\slf4j-log4j12-1.7.16.ja
r;D:\.m2\com\ning\compress-lzf\1.0.3\compress-lzf-1.0.3.jar;D:\.m2\org\xerial\snappy\snappy-java\1.1.7.3\snappy-java-1.1.7.3.jar;D:\.m2\org\lz4\lz4-java\1.4.0\lz4-java-1.4.0.jar;D:\.m2\com\github\luben\zstd-jni\1.3.2-2\zstd-jni-1.3.2-2.jar;D:\.m2\org\roaringbitmap\RoaringBitmap\0.7.45\RoaringBitmap-0.7.45.jar;D:\.m2\org\roaringbitmap\shims\0.7.45\shims-0.7.45.jar;D:\.m2\commons-net\commons-net\3.1\commons-net-3.1.jar;D:\.m2\org\scala-lang\scala-library\2.11.12\scala-library-2.11.12.jar;D:\.m2\org\json4s\json4s-jackson_2.11\3.5.3\json4s-jackson_2.11-3.5.3.jar;D:\.m2\org\json4s\json4s-core_2.11\3.5.3\json4s-core_2.11-3.5.3.jar;D:\.m2\org\json4s\json4s-ast_2.11\3.5.3\json4s-ast_2.11-3.5.3.jar;D:\.m2\org\json4s\json4s-scalap_2.11\3.5.3\json4s-scalap_2.11-3.5.3.jar;D:\.m2\org\scala-lang\modules\scala-xml_2.11\1.0.6\scala-xml_2.11-1.0.6.jar;D:\.m2\org\glassfish\jersey\core\jersey-client\2.22.2\jersey-client-2.22.2.jar;D:\.m2\javax\ws\rs\javax.ws.rs-api\2.0.1\javax.ws.rs-api-2.0.1.jar;D:\.m2\org\glassfish\hk2\hk2-api\2.4.0-b34\hk2-api-2.4.0-b34.jar;D:\.m2\org\glassfish\hk2\hk2-utils\2.4.0-b34\hk2-utils-2.4.0-b34.jar;D:\.m2\org\glassfish\hk2\external\aopalliance-repackaged\2.4.0-b34\aopalliance-repackaged-2.4.0-b34.jar;D:\.m2\org\glassfish\hk2\external\javax.inject\2.4.0-b34\javax.inject-2.4.0-b34.jar;D:\.m2\org\glassfish\hk2\hk2-locator\2.4.0-b34\hk2-locator-2.4.0-b34.jar;D:\.m2\org\javassist\javassist\3.18.1-GA\javassist-3.18.1-GA.jar;D:\.m2\org\glassfish\jersey\core\jersey-common\2.22.2\jersey-common-2.22.2.jar;D:\.m2\javax\annotation\javax.annotation-api\1.2\javax.annotation-api-1.2.jar;D:\.m2\org\glassfish\jersey\bundles\repackaged\jersey-guava\2.22.2\jersey-guava-2.22.2.jar;D:\.m2\org\glassfish\hk2\osgi-resource-locator\1.0.1\osgi-resource-locator-1.0.1.jar;D:\.m2\org\glassfish\jersey\core\jersey-server\2.22.2\jersey-server-2.22.2.jar;D:\.m2\org\glassfish\jersey\media\jersey-media-jaxb\2.22.2\jersey-media-jaxb-2.22.2.jar;D:\.m2\javax\validation\validation-api\1.1.0.Final\validation-api-1.1.0.Final.jar;D:\.m2\org\glassfish\jersey\containers\jersey-container-servlet\2.22.2\jersey-container-servlet-2.22.2.jar;D:\.m2\org\glassfish\jersey\containers\jersey-container-servlet-core\2.22.2\jersey-container-servlet-core-2.22.2.jar;D:\.m2\io\netty\netty-all\4.1.17.Final\netty-all-4.1.17.Final.jar;D:\.m2\io\netty\netty\3.9.9.Final\netty-3.9.9.Final.jar;D:\.m2\com\clearspring\analytics\stream\2.7.0\stream-2.7.0.jar;D:\.m2\io\dropwizard\metrics\metrics-core\3.1.5\metrics-core-3.1.5.jar;D:\.m2\io\dropwizard\metrics\metrics-jvm\3.1.5\metrics-jvm-3.1.5.jar;D:\.m2\io\dropwizard\metrics\metrics-json\3.1.5\metrics-json-3.1.5.jar;D:\.m2\io\dropwizard\metrics\metrics-graphite\3.1.5\metrics-graphite-3.1.5.jar;D:\.m2\com\fasterxml\jackson\core\jackson-databind\2.6.7.1\jackson-databind-2.6.7.1.jar;D:\.m2\com\fasterxml\jackson\module\jackson-module-scala_2.11\2.6.7.1\jackson-module-scala_2.11-2.6.7.1.jar;D:\.m2\org\scala-lang\scala-reflect\2.11.8\scala-reflect-2.11.8.jar;D:\.m2\com\fasterxml\jackson\module\jackson-module-paranamer\2.7.9\jackson-module-paranamer-2.7.9.jar;D:\.m2\org\apache\ivy\ivy\2.4.0\ivy-2.4.0.jar;D:\.m2\oro\oro\2.0.8\oro-2.0.8.jar;D:\.m2\net\razorvine\pyrolite\4.13\pyrolite-4.13.jar;D:\.m2\net\sf\py4j\py4j\0.10.7\py4j-0.10.7.jar;D:\.m2\org\apache\spark\spark-tags_2.11\2.4.4\spark-tags_2.11-2.4.4.jar;D:\.m2\org\apache\commons\commons-crypto\1.0.0\commons-crypto-1.0.0.jar;D:\.m2\org\spark-project\spark\unused\1.0.0\unused-1.0.0.jar;D:\.m2\org\apache\spark\spark-streaming_2.11\2.4.4\spark-streaming_2.
11-2.4.4.jar" TransformationsTest
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
19/12/02 15:09:34 INFO SparkContext: Running Spark version 2.4.4
19/12/02 15:09:35 INFO SparkContext: Submitted application: transformations test
19/12/02 15:09:35 INFO SecurityManager: Changing view acls to: lenovo,root
19/12/02 15:09:35 INFO SecurityManager: Changing modify acls to: lenovo,root
19/12/02 15:09:35 INFO SecurityManager: Changing view acls groups to: 
19/12/02 15:09:35 INFO SecurityManager: Changing modify acls groups to: 
19/12/02 15:09:35 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users  with view permissions: Set(lenovo, root); groups with view permissions: Set(); users  with modify permissions: Set(lenovo, root); groups with modify permissions: Set()
19/12/02 15:09:37 INFO Utils: Successfully started service 'sparkDriver' on port 4617.
19/12/02 15:09:37 INFO SparkEnv: Registering MapOutputTracker
19/12/02 15:09:37 INFO SparkEnv: Registering BlockManagerMaster
19/12/02 15:09:37 INFO BlockManagerMasterEndpoint: Using org.apache.spark.storage.DefaultTopologyMapper for getting topology information
19/12/02 15:09:37 INFO BlockManagerMasterEndpoint: BlockManagerMasterEndpoint up
19/12/02 15:09:37 INFO DiskBlockManager: Created local directory at C:\Users\lenovo\AppData\Local\Temp\blockmgr-23f07ea3-281d-4747-a259-33c756f52fab
19/12/02 15:09:37 INFO MemoryStore: MemoryStore started with capacity 896.4 MB
19/12/02 15:09:37 INFO SparkEnv: Registering OutputCommitCoordinator
19/12/02 15:09:37 INFO Utils: Successfully started service 'SparkUI' on port 4040.
19/12/02 15:09:37 INFO SparkUI: Bound SparkUI to 0.0.0.0, and started at http://192.168.89.1:4040
19/12/02 15:09:37 INFO Executor: Starting executor ID driver on host localhost
19/12/02 15:09:37 INFO Utils: Successfully started service 'org.apache.spark.network.netty.NettyBlockTransferService' on port 4638.
19/12/02 15:09:37 INFO NettyBlockTransferService: Server created on 192.168.89.1:4638
19/12/02 15:09:37 INFO BlockManager: Using org.apache.spark.storage.RandomBlockReplicationPolicy for block replication policy
19/12/02 15:09:37 INFO BlockManagerMaster: Registering BlockManager BlockManagerId(driver, 192.168.89.1, 4638, None)
19/12/02 15:09:37 INFO BlockManagerMasterEndpoint: Registering block manager 192.168.89.1:4638 with 896.4 MB RAM, BlockManagerId(driver, 192.168.89.1, 4638, None)
19/12/02 15:09:37 INFO BlockManagerMaster: Registered BlockManager BlockManagerId(driver, 192.168.89.1, 4638, None)
19/12/02 15:09:37 INFO BlockManager: Initialized BlockManager: BlockManagerId(driver, 192.168.89.1, 4638, None)
19/12/02 15:09:44 ERROR ReceiverTracker: Deregistered receiver for stream 0: Restarting receiver with delay 2000ms: Error connecting to SparkOnStandalone:9999 - java.net.ConnectException: Connection refused: connect
	at java.net.DualStackPlainSocketImpl.connect0(Native Method)
	at java.net.DualStackPlainSocketImpl.socketConnect(DualStackPlainSocketImpl.java:79)
	at java.net.AbstractPlainSocketImpl.doConnect(AbstractPlainSocketImpl.java:350)
	at java.net.AbstractPlainSocketImpl.connectToAddress(AbstractPlainSocketImpl.java:206)
	at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:188)
	at java.net.PlainSocketImpl.connect(PlainSocketImpl.java:172)
	at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392)
	at java.net.Socket.connect(Socket.java:589)
	at java.net.Socket.connect(Socket.java:538)
	at java.net.Socket.<init>(Socket.java:434)
	at java.net.Socket.<init>(Socket.java:211)
	at org.apache.spark.streaming.dstream.SocketReceiver.onStart(SocketInputDStream.scala:61)
	at org.apache.spark.streaming.receiver.ReceiverSupervisor.startReceiver(ReceiverSupervisor.scala:149)
	at org.apache.spark.streaming.receiver.ReceiverSupervisor.start(ReceiverSupervisor.scala:131)
	at org.apache.spark.streaming.scheduler.ReceiverTracker$ReceiverTrackerEndpoint$$anonfun$9.apply(ReceiverTracker.scala:601)
	at org.apache.spark.streaming.scheduler.ReceiverTracker$ReceiverTrackerEndpoint$$anonfun$9.apply(ReceiverTracker.scala:591)
	at org.apache.spark.SparkContext$$anonfun$37.apply(SparkContext.scala:2212)
	at org.apache.spark.SparkContext$$anonfun$37.apply(SparkContext.scala:2212)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

19/12/02 15:09:49 ERROR ReceiverTracker: Deregistered receiver for stream 0: Restarting receiver with delay 2000ms: Error connecting to SparkOnStandalone:9999 - java.net.ConnectException: Connection refused: connect
	at java.net.DualStackPlainSocketImpl.connect0(Native Method)
	at java.net.DualStackPlainSocketImpl.socketConnect(DualStackPlainSocketImpl.java:79)
	at java.net.AbstractPlainSocketImpl.doConnect(AbstractPlainSocketImpl.java:350)
	at java.net.AbstractPlainSocketImpl.connectToAddress(AbstractPlainSocketImpl.java:206)
	at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:188)
	at java.net.PlainSocketImpl.connect(PlainSocketImpl.java:172)
	at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392)
	at java.net.Socket.connect(Socket.java:589)
	at java.net.Socket.connect(Socket.java:538)
	at java.net.Socket.<init>(Socket.java:434)
	at java.net.Socket.<init>(Socket.java:211)
	at org.apache.spark.streaming.dstream.SocketReceiver.onStart(SocketInputDStream.scala:61)
	at org.apache.spark.streaming.receiver.ReceiverSupervisor.startReceiver(ReceiverSupervisor.scala:149)
	at org.apache.spark.streaming.receiver.ReceiverSupervisor$$anonfun$restartReceiver$1.apply$mcV$sp(ReceiverSupervisor.scala:198)
	at org.apache.spark.streaming.receiver.ReceiverSupervisor$$anonfun$restartReceiver$1.apply(ReceiverSupervisor.scala:189)
	at org.apache.spark.streaming.receiver.ReceiverSupervisor$$anonfun$restartReceiver$1.apply(ReceiverSupervisor.scala:189)
	at scala.concurrent.impl.Future$PromiseCompletingRunnable.liftedTree1$1(Future.scala:24)
	at scala.concurrent.impl.Future$PromiseCompletingRunnable.run(Future.scala:24)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

-------------------------------------------
Time: 1575270585000 ms
-------------------------------------------

19/12/02 15:09:53 ERROR ReceiverTracker: Deregistered receiver for stream 0: Restarting receiver with delay 2000ms: Error connecting to SparkOnStandalone:9999 - java.net.ConnectException: Connection refused: connect
	... (stack trace identical to the one above)

-------------------------------------------
Time: 1575270590000 ms
-------------------------------------------

-------------------------------------------
Time: 1575270595000 ms
-------------------------------------------

19/12/02 15:09:57 ERROR ReceiverTracker: Deregistered receiver for stream 0: Restarting receiver with delay 2000ms: Error connecting to SparkOnStandalone:9999 - java.net.ConnectException: Connection refused: connect
	... (stack trace identical to the one above)

19/12/02 15:10:01 ERROR ReceiverTracker: Deregistered receiver for stream 0: Restarting receiver with delay 2000ms: Error connecting to SparkOnStandalone:9999 - java.net.ConnectException: Connection refused: connect
	... (stack trace identical to the one above)
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, State, StateSpec, StreamingContext}

/**
  * Recover a streaming application's intermediate state from the checkpoint
  *
  */


// How to test:
// run main, start `nc -lk 7777`, type some data, watch the console
// kill the JVM
// run main again, `nc -lk 7777`, type more data, watch the console
// ok
object RecoveryDataWithCheckpoint {

  // main
  def main(args: Array[String]): Unit = {
    // Get the existing ssc from the checkpoint, or create a new one
    val newSSC = StreamingContext.getOrCreate("hdfs://SparkOnStandalone:9000/checkpoint4", () => {
      val conf = new SparkConf().setMaster("local[*]").setAppName("recovery data")
      val ssc = new StreamingContext(conf, Seconds(5))
      ssc.sparkContext.setLogLevel("ERROR")
      ssc.checkpoint("hdfs://SparkOnStandalone:9000/checkpoint4")
      val ds14 = ssc.socketTextStream("SparkOnStandalone", 7777)
      ds14 // Hello Hello Spark
        .flatMap(_.split(" "))
        .map((_, 1)) // (Hello,1)(Hello,1)(Spark,1)
        // key: the word
        // value: the default value 1
        // state: the accumulated state object
        .mapWithState(StateSpec.function((key: String, value: Option[Int], state: State[Int]) => {
        // if k already has accumulated state: new result = previous state + value
        // if k has no accumulated state yet: value itself is the current result
        var count = 0
        if (state.exists()) {
          count = state.get() + value.get
        } else {
          count = value.get
        }
        // count is the latest accumulated result
        state.update(count)
        // return the result
        (key, count) // DStream[(String,Int)]
      }))
        .checkpoint(Seconds(5))  // sync the checkpoint data to HDFS once every 5 seconds
        .print()
      ssc
    })

    newSSC.sparkContext.setLogLevel("ERROR")

    // start the ssc
    newSSC.start()

    newSSC.awaitTermination()
  }
}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, State, StateSpec, StreamingContext}

/**
  * Spark Streaming transformation operations
  *
  * parent DStream --> child DStream
  */

// How to test:
// run main, start `nc -lk 9999`, type some data, watch the console
// ok
object TransformationsTest {

  // main
  def main(args: Array[String]): Unit = {

    // 1. Initialize the streaming context
    val conf = new SparkConf().setMaster("local[*]").setAppName("transformations test")

    val ssc = new StreamingContext(conf, Seconds(5))

    ssc.sparkContext.setLogLevel("ERROR")

    // 2. Build the source DStream

    // val ds = ssc.socketTextStream("SparkOnStandalone", 9999);

    //3. Apply transformation operations
    //--------------------------------------------------------------------------
    //    ds
    //      .flatMap(_.split(" ")) // line --> word
    //      .map((_, 1)) // word ---> (word,1)
    //      .filter(t2 => t2._1.startsWith("H")) // keep only the words starting with H
    //      .repartition(5) // split the data into 5 partitions
    //      .print()
    //--------------------------------------------------------------------------


    //--------------------------------------------------------------------------
    // union: combine two DStreams into a new DStream
    //    val ds1 = ssc.socketTextStream("SparkOnStandalone", 9999);
    //    val ds2 = ssc.socketTextStream("SparkOnStandalone", 8888);
    //    ds1
    //      .union(ds2)
    //      .print()

    //--------------------------------------------------------------------------

    //--------------------------------------------------------------------------
    // count: return the number of elements in each micro-batch RDD
    //    val ds3 = ssc.socketTextStream("SparkOnStandalone", 9999);
    //    ds3
    //      .count()
    //      .print()
    //--------------------------------------------------------------------------

    //--------------------------------------------------------------------------
    // reduce: apply the function across the elements of each micro-batch
    //    val ds4 = ssc.socketTextStream("SparkOnStandalone", 9999);
    //
    //    ds4
    //      .flatMap(_.split(" "))
    //      .map(strNum => strNum.toInt)
    //      .reduce((v1, v2) => v1 + v2) // 1 2 3 4 5
    //      .print()
    //--------------------------------------------------------------------------

    //--------------------------------------------------------------------------
    // countByValue: called on the DStream, returns (k, count), the number of occurrences of each element k
    //    val ds6 = ssc.socketTextStream("SparkOnStandalone", 9999);
    //    ds6
    //      .flatMap(_.split(" "))   // Hello Hello Spark
    //      .countByValue(2)   // (Hello,2)
    //                                      // (Spark,1)
    //      .print()
    //--------------------------------------------------------------------------

    //--------------------------------------------------------------------------
    // reduceByKey: apply the reduce function per key
    //    val ds7 = ssc.socketTextStream("SparkOnStandalone", 9999);
    //    ds7
    //      .flatMap(_.split(" "))
    //      .map((_, 1))
    //      .reduceByKey(_ - _)
    //      .print()
    //--------------------------------------------------------------------------

    //--------------------------------------------------------------------------
    // join: join two DStreams [rarely useful]; stream + batch joins are the common case
    //    val ds8 = ssc.socketTextStream("SparkOnStandalone", 9999).map((_, 1));  // (a,1)
    //    val ds9 = ssc.socketTextStream("SparkOnStandalone", 8888).map((_, 1));  // (a,1)
    //    ds8
    //      .join(ds9)  // (a,(1,1))  inner join
    //      .print()

    //--------------------------------------------------------------------------

    //--------------------------------------------------------------------------
    // cogroup
    //    val ds10 = ssc.socketTextStream("SparkOnStandalone", 9999).map((_, 1)); // (a,1)
    //    val ds11 = ssc.socketTextStream("SparkOnStandalone", 8888).map((_, 1)); // (a,1)
    //    ds10
    //      .cogroup(ds11) // (a,([1],[1]))  the official docs are slightly off here
    //      .print()
    //--------------------------------------------------------------------------

    //--------------------------------------------------------------------------
    // transform: apply an RDD operation to every micro-batch RDD of the DStream and return a new DStream [very important]
    // Key point: it exposes each micro-batch as an RDD, so any RDD operation can be applied
    // Example: a system runs a lottery, but users on the system blacklist are not allowed to take part
    //    1. Flash-sale abuse: offending users are added to the system blacklist [username, IP]
    //    2. Lottery requests (normal users + blacklisted users)
    //       stream (lottery requests) + batch (blacklist)
    /*val ds12 = ssc.socketTextStream("SparkOnStandalone", 9999);


    val blacklist = List((1, "zs"), (2, "ww"), (3, "tq"))
    val blacklistRDD = ssc.sparkContext.makeRDD(blacklist)

    // stream record format: userid method url
    ds12
      .map(request => {
        val arr = request.split(" ")
        val userId: Int = arr(0).toInt
        val method: String = arr(1)
        val url: String = arr(2)
        (userId, (method, url))
      })
      .transform(streamRDD => { // stream --> streamRDD.leftOuterJoin(batchRDD)
        streamRDD.leftOuterJoin(blacklistRDD)
      })
      // Keep only whitelisted users.  Blacklisted user: (1,((GET,/xxx),Some(zs)))
      //                               Whitelisted user: (4,((DELETE,/xxx2),None))
      .filter(t2 => t2._2._2.isEmpty) // is the joined blacklist entry empty (None)?
      .map(t2 => (t2._1, t2._2._1._1, t2._2._1._2))
      .print()*/



    //--------------------------------------------------------------------------


    //--------------------------------------------------------------------------
    // updateStateByKey
    // a checkpoint directory is required: the state data is persisted to HDFS
        /*ssc.checkpoint("hdfs://SparkOnStandalone:9000/checkpoint")
        val ds13 = ssc.socketTextStream("SparkOnStandalone", 7777)
        ds13 // Hello Hello Spark
          .flatMap(_.split(" "))
          .map((_, 1)) // (Hello,1)(Hello,1)(Spark,1)
          // newValues = [1,1]
          // before the update: state
          // after the update:  state + newValues.length
          // first parameter:  newValues, the values for the same key k in the current micro-batch
          // second parameter: state, how many times values for key k have been seen so far
          .updateStateByKey((newValues: Seq[Int], state: Option[Int]) => Some(state.getOrElse(0) + newValues.length))
          .print()*/
    //--------------------------------------------------------------------------

    //--------------------------------------------------------------------------
    // mapWithState
    ssc.checkpoint("hdfs://SparkOnStandalone:9000/checkpoint2")
    val ds14 = ssc.socketTextStream("SparkOnStandalone", 7777)
    ds14 // Hello Hello Spark
      .flatMap(_.split(" "))
      .map((_, 1)) // (Hello,1)(Hello,1)(Spark,1)
      // key: the word
      // value: the default value 1
      // state: the accumulated state object
      .mapWithState(StateSpec.function((key: String, value: Option[Int], state: State[Int]) => {
      // if k already has accumulated state: new result = previous state + value
      // if k has no accumulated state yet: value itself is the current result
      var count = 0
      if (state.exists()) {
        count = state.get() + value.get
      } else {
        count = value.get
      }
      // count is the latest accumulated result
      state.update(count)
      // return the result
      (key, count) // DStream[(String,Int)]
    }))
      .print()
    //--------------------------------------------------------------------------



    //4. Start the streaming application
    ssc.start()

    //5. Gracefully shut down the application
    ssc.awaitTermination()
  }
}
package com.baizhi

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Word count, Spark edition
  */

// code from day 3

// How to test:
// just run main
//
// ok
object WordCountApplicationOnLocalWithCache {

  // main
  def main(args: Array[String]): Unit = {

    // 1. Create the SparkContext; the context object provides the runtime environment for the Spark application
    val conf = new SparkConf()
      .setAppName("WordCount Apps")
      .setMaster("local[*]") // local 本地模式 模拟Spark应用运行 [*] 当前计算机的所有核心  cores 6

    val sc = new SparkContext(conf)

    // 2. Define the DAG of computations (directed acyclic graph: business logic --> multiple transformations --> final output)
    val rdd = sc.textFile("hdfs://SparkOnStandalone:9000/text.txt")

    // cache the root RDD of the lineage
    rdd.cache()
    rdd.count() // count the elements in the rdd (the first action materializes the cache)

    val start = System.currentTimeMillis()
    rdd.count()
    val end = System.currentTimeMillis()
    println("使用cache应用耗费时间:" + (end - start))  // 19ms


    // un-cache the RDD
    rdd.unpersist()

    val start2 = System.currentTimeMillis()
    rdd
      .count()
    val end2 = System.currentTimeMillis()
    println("没有cache应用耗费时间:" + (end2 - start2))  // 24ms

    // 3. Release resources
    sc.stop()
  }
}


// Elapsed time with cache: 117
// Elapsed time without cache: 247
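cache() is shorthand for persist(StorageLevel.MEMORY_ONLY). As a minimal sketch of the same timing experiment with an explicit storage level (the MEMORY_AND_DISK choice and the object name below are illustrative assumptions, not part of the original notes):

```scala
import org.apache.spark.storage.StorageLevel
import org.apache.spark.{SparkConf, SparkContext}

object WordCountApplicationOnLocalWithPersist {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("persist demo").setMaster("local[*]"))
    val rdd = sc.textFile("hdfs://SparkOnStandalone:9000/text.txt")

    // cache() == persist(StorageLevel.MEMORY_ONLY); MEMORY_AND_DISK also spills to disk under memory pressure
    rdd.persist(StorageLevel.MEMORY_AND_DISK)
    rdd.count() // the first action materializes the persisted data

    val start = System.currentTimeMillis()
    rdd.count() // this count is served from the persisted copy
    println("Elapsed time with MEMORY_AND_DISK: " + (System.currentTimeMillis() - start))

    sc.stop()
  }
}
```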
package com.baizhi

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Word count, Spark edition
  */

// code from day 3
// How to test:
// just run main
// ok
object WordCountApplicationOnLocalWithCheckpoint {

  // main
  def main(args: Array[String]): Unit = {
    //1. Create the SparkContext; the context object provides the runtime environment for the Spark application
    val conf = new SparkConf()
      .setAppName("WordCount Apps")
      .setMaster("local[*]") // local 本地模式 模拟Spark应用运行 [*] 当前计算机的所有核心  cores 6

    val sc = new SparkContext(conf)

    // set the checkpoint directory
    sc.setCheckpointDir("hdfs://SparkOnStandalone:9000/checkpoint5")

    //2. Define the DAG of computations (directed acyclic graph: business logic --> multiple transformations --> final output)
    val rdd = sc.textFile("hdfs://SparkOnStandalone:9000/text.txt")

    // rdd.cache()

    // call checkpoint() on the RDD that needs checkpointing
    // rdd.checkpoint()

    rdd.count()


    val mapRDD = rdd
      .flatMap(_.split(" "))
      .map((_, 1))

    mapRDD.cache()  // double fault tolerance: cache the parent RDD of the wide dependency, then also checkpoint it
    mapRDD.checkpoint()

    mapRDD
      .groupByKey()
      .map(t2 => (t2._1, t2._2.size))
      .foreach(println)

    //3. Release resources
    sc.stop()
  }
}


/**(Kafka,1)
(hadoop,1)
(hello,3)
(Flume,1)*/
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * Streaming application
  */

// How to test:
// run main --- nc -lk 7777
// check the output
// ok
object WordCountApplication {

  // main
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("streaming wordcount")
    val ssc = new StreamingContext(conf, Seconds(5)) // cut the stream into a micro batch every 5 seconds

    // raise the log level
    ssc.sparkContext.setLogLevel("ERROR")

    // Build the DStream
    // build the DStream from data received over a TCP socket
    val dStream = ssc.socketTextStream("SparkOnStandalone", 7777)
    // apply computations to the stream data
    dStream
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
      .print() // print the results

    // start the streaming application
    ssc.start()

    // gracefully shut down the streaming application
    ssc.awaitTermination()
  }
}
[root@SparkOnStandalone ~]# date
Mon Dec  2 07:34:44 EST 2019
[root@SparkOnStandalone ~]# date -s '2019-12-02 20:37:30'
Mon Dec  2 20:37:30 EST 2019
# write the system time to the hardware clock
[root@SparkOnStandalone ~]# clock -w

[root@SparkOnStandalone ~]# hdfs dfs -ls /
Found 7 items
-rw-r--r--   1 root supergroup       3952 2019-12-01 18:58 /README.md
drwxr-xr-x   - root supergroup          0 2019-12-02 06:22 /checkpoint
drwxr-xr-x   - root supergroup          0 2019-12-02 03:26 /checkpoint2
drwxr-xr-x   - root supergroup          0 2019-12-02 03:29 /checkpoint4
drwxr-xr-x   - root supergroup          0 2019-12-02 06:07 /checkpoint5
drwxr-xr-x   - root supergroup          0 2019-12-01 09:52 /result
-rw-r--r--   1 root supergroup         38 2019-12-01 08:30 /text.txt


[root@SparkOnStandalone ~]# hdfs dfs -mkdir /data
[root@SparkOnStandalone ~]# hdfs dfs -ls /
Found 8 items
-rw-r--r--   1 root supergroup       3952 2019-12-01 18:58 /README.md
drwxr-xr-x   - root supergroup          0 2019-12-02 06:22 /checkpoint
drwxr-xr-x   - root supergroup          0 2019-12-02 03:26 /checkpoint2
drwxr-xr-x   - root supergroup          0 2019-12-02 03:29 /checkpoint4
drwxr-xr-x   - root supergroup          0 2019-12-02 06:07 /checkpoint5
drwxr-xr-x   - root supergroup          0 2019-12-02 20:41 /data
drwxr-xr-x   - root supergroup          0 2019-12-01 09:52 /result
-rw-r--r--   1 root supergroup         38 2019-12-01 08:30 /text.txt


[root@SparkOnStandalone ~]# hdfs dfs -put text.txt /data
[root@SparkOnStandalone ~]# date
Mon Dec  2 20:51:12 EST 2019
[root@SparkOnStandalone ~]# date -s '2019-12-02 20:58:30'
Mon Dec  2 20:58:30 EST 2019
[root@SparkOnStandalone ~]# clock -w
[root@SparkOnStandalone ~]# date
Mon Dec  2 20:58:41 EST 2019
[root@SparkOnStandalone ~]# hdfs dfs -put text.txt /data
put: `/data/text.txt': File exists
[root@SparkOnStandalone ~]# hdfs dfs -rm -r -f /data/*
Deleted /data/text.txt
[root@SparkOnStandalone ~]# hdfs dfs -put text.txt /data
[root@SparkOnStandalone ~]# hdfs dfs -rm -r -f /data/*
Deleted /data/text.txt
[root@SparkOnStandalone ~]# date -s '2019-12-02 21:01:30'
Mon Dec  2 21:01:30 EST 2019
[root@SparkOnStandalone ~]# dste
-bash: dste: command not found
[root@SparkOnStandalone ~]# date
Mon Dec  2 21:01:36 EST 2019
[root@SparkOnStandalone ~]# hdfs dfs -put text.txt /data
[root@SparkOnStandalone ~]# hdfs dfs -rm -r -f /data/*
Deleted /data/text.txt
[root@SparkOnStandalone ~]# date -s '2019-12-02 08:05:30'
Mon Dec  2 08:05:30 EST 2019
[root@SparkOnStandalone ~]# date
Mon Dec  2 08:05:34 EST 2019
[root@SparkOnStandalone ~]# hdfs dfs -put text.txt /data
[root@SparkOnStandalone ~]#
(hadoop,1)
(Kafka,1)
(hello,3)
(Flume,1)

-------------------------------------------
Time: 1575291985000 ms
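The word-count output above comes from a directory-monitoring streaming job whose code is not pasted in these notes. A minimal sketch of what such a job could look like, assuming textFileStream over the /data directory created above (the object name is a placeholder); textFileStream only picks up files whose modification time falls inside the current batch window, which is presumably why the system clock keeps being adjusted in the transcript:

```scala
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object HDFSDirectoryWordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("hdfs dir wordcount")
    val ssc = new StreamingContext(conf, Seconds(5))
    ssc.sparkContext.setLogLevel("ERROR")

    // Monitor the /data directory; each new file put into it is read as part of a micro-batch.
    ssc.textFileStream("hdfs://SparkOnStandalone:9000/data")
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
      .print()

    ssc.start()
    ssc.awaitTermination()
  }
}
```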
[root@HadoopNode00 ~]# cd /home/
[root@HadoopNode00 home]# ls
1.txt  2.txt  findbugs  flume  hadoop  hbase  hive  java  kafka  maven  protobuf  zk
[root@HadoopNode00 home]# cd zk/
[root@HadoopNode00 zk]# ls
data  data01  data02  data03  zookeeper-3.4.6
[root@HadoopNode00 zk]# cd zookeeper-3.4.6/
[root@HadoopNode00 zookeeper-3.4.6]# ls
bin          dist-maven       LICENSE.txt           src                       zookeeper.out
build.xml    docs             NOTICE.txt            zookeeper-3.4.6.jar
CHANGES.txt  ivysettings.xml  README_packaging.txt  zookeeper-3.4.6.jar.asc
conf         ivy.xml          README.txt            zookeeper-3.4.6.jar.md5
contrib      lib              recipes               zookeeper-3.4.6.jar.sha1
[root@HadoopNode00 zookeeper-3.4.6]# bin/zkServer.sh start conf/zk.cfg
JMX enabled by default
Using config: conf/zk.cfg
Starting zookeeper ... STARTED
[root@HadoopNode00 zookeeper-3.4.6]# bin/zkServer.sh status conf/zk.cfg
JMX enabled by default
Using config: conf/zk.cfg
Mode: standalone
[root@HadoopNode00 zookeeper-3.4.6]# jps
2059 QuorumPeerMain
2188 Jps
[root@HadoopNode00 zookeeper-3.4.6]# clear
[root@HadoopNode00 zookeeper-3.4.6]# cd ..
[root@HadoopNode00 zk]# cd ..
[root@HadoopNode00 home]# cd kafka/
[root@HadoopNode00 kafka]# ls
kafka_2.11-0.11.0.0
[root@HadoopNode00 kafka]# cd kafka_2.11-0.11.0.0/
[root@HadoopNode00 kafka_2.11-0.11.0.0]# bin/kafka-server-start.sh -daemon config/server.properties
[root@HadoopNode00 kafka_2.11-0.11.0.0]# jps
3493 Jps
3464 Kafka
2059 QuorumPeerMain
[root@HadoopNode00 kafka_2.11-0.11.0.0]# bin/kafka-topics.sh --list --zookeeper HadoopNode00:2181
__consumer_offsets
t01
[root@HadoopNode00 kafka_2.11-0.11.0.0]# bin/kafka-topics.sh --create --topic baizhi --partitions 1 --replication-factor 1 --zookeeper HadoopNode00:2181
Created topic "baizhi".
[root@HadoopNode00 kafka_2.11-0.11.0.0]# bin/kafka-topics.sh --list --zookeeper HadoopNode00:2181
__consumer_offsets
baizhi
t01
# start a console producer
[root@HadoopNode00 kafka_2.11-0.11.0.0]# bin/kafka-console-producer.sh --broker-list HadoopNode00:9092 --topic baizhi
>
>Heel Scala
(null,Heel Scala,0,0,baizhi,1575218723367)
>hello hadoop
(null,hello hadoop,0,1,baizhi,1575218775978)
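The notes stop at the console producer. A minimal sketch of a Spark Streaming consumer for the baizhi topic, assuming the spark-streaming-kafka-0-10 connector matching this Spark/Scala version is on the classpath; the group id and offset-reset setting below are illustrative choices, not values taken from the transcript:

```scala
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}

object KafkaWordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("kafka wordcount")
    val ssc = new StreamingContext(conf, Seconds(5))
    ssc.sparkContext.setLogLevel("ERROR")

    // Kafka consumer settings; the broker address matches the console producer above.
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "HadoopNode00:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "baizhi-consumer",
      "auto.offset.reset" -> "latest"
    )

    KafkaUtils
      .createDirectStream[String, String](
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](Seq("baizhi"), kafkaParams))
      .map(record => record.value()) // keep only the message body
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
      .print()

    ssc.start()
    ssc.awaitTermination()
  }
}
```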
[root@HadoopNode00 apache-flume-1.9.0-bin]# cp conf/demo01.conf conf/spark.conf
[root@HadoopNode00 apache-flume-1.9.0-bin]# ls conf/
agent001.conf  agent01.conf                          demo03Avro-Mem-Logger1.conf   demointerceptors.conf           flume-env.sh.template
agent002.conf  agent02.conf                          demo03Avro-Mem-Logger.conf    demoSelector.conf               log4j.properties
agent003.conf  demo01.conf                           demo04TAILDIR-File-HDFS.conf  flume-conf.properties.template  spark.conf
agent004.conf  demo02SpoolingDir-Mem-File_Roll.conf  demo05EXEC-Mem-Logger.conf    flume-env.ps1.template
[root@HadoopNode00 apache-flume-1.9.0-bin]# vi conf/spark.conf
[root@HadoopNode00 apache-flume-1.9.0-bin]# cat conf/spark.conf
a1.sources = r1
a1.sinks = k1
a1.channels = c1


a1.sources.r1.type = netcat
a1.sources.r1.bind = HadoopNode00
a1.sources.r1.port = 6666




a1.channels.c1.type = memory



a1.sinks.k1.type = logger



a1.sources.r1.channels = c1

a1.sinks.k1.channel = c1


a1.sinks.avroSink.type = avro
a1.sinks.avroSink.channel = c1
a1.sinks.avroSink.hostname = 192.168.89.1
a1.sinks.avroSink.port = 9999
[root@HadoopNode00 apache-flume-1.9.0-bin]# vi conf/spark.conf
[root@HadoopNode00 apache-flume-1.9.0-bin]# cdt conf/spark.conf
-bash: cdt: command not found
[root@HadoopNode00 apache-flume-1.9.0-bin]# cat conf/spark.conf
a1.sources = r1
a1.sinks = avroSink
a1.channels = c1

a1.sources.r1.type = netcat
a1.sources.r1.bind = HadoopNode00
a1.sources.r1.port = 6666

a1.channels.c1.type = memory

a1.sources.r1.channels = c1


a1.sinks.avroSink.type = avro
a1.sinks.avroSink.channel = c1
a1.sinks.avroSink.hostname = 192.168.89.1
a1.sinks.avroSink.port = 9999
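The avroSink above pushes events to 192.168.89.1:9999, i.e. back to the development machine. A minimal sketch of the receiving side, assuming the (deprecated) spark-streaming-flume connector is available for this Spark version; FlumeUtils.createStream is the push-based receiver API, and the object name is a placeholder:

```scala
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.flume.FlumeUtils

object FlumeWordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("flume wordcount")
    val ssc = new StreamingContext(conf, Seconds(5))
    ssc.sparkContext.setLogLevel("ERROR")

    // Bind a receiver on the address the Flume avro sink pushes to (192.168.89.1:9999 above).
    FlumeUtils.createStream(ssc, "192.168.89.1", 9999)
      .map(event => new String(event.event.getBody.array(), "UTF-8")) // event body --> line of text
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
      .print()

    ssc.start()
    ssc.awaitTermination()
  }
}
```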
Spark integrates with Hive, so Hive's metadata, tables, and query language can be used from Spark. To use this integration you need to enable Hive support in Spark.

First, configure Hive support when the Spark application starts. The following options on the SparkSession builder enable it:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Spark Hive Integration") \
    .config("spark.sql.warehouse.dir", "/user/hive/warehouse") \
    .enableHiveSupport() \
    .getOrCreate()
```

In the example above, Hive support is enabled and `spark.sql.warehouse.dir` is pointed at the Hive warehouse directory.

Once Hive support is enabled, Hive queries can be executed with the SparkSession's `sql` method. For example, to read data from a Hive table:

```python
df = spark.sql("SELECT * FROM database.table")
```

Here `database.table` is the name of the Hive table to query.

Note that Spark loads the Hive table's data as a DataFrame, so Spark's APIs can be used for further processing and analysis.

Besides running Hive queries, the DataFrame API can be used to create, manipulate, and manage Hive tables: `spark.catalog` gives access to Hive's metadata, and the DataFrame API can create new tables, insert data, and so on.

That is the basic picture of Spark-Hive integration: combining the two lets you use Hive's metadata and query capabilities together with Spark's data processing and analysis features.
