pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.example</groupId>
    <artifactId>untitled1</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
    </properties>

    <dependencies>
        <!-- Scala standard library; 2.11.x must match the _2.11 Spark artifacts below -->
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>2.11.8</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.4</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.specs</groupId>
            <artifactId>specs</artifactId>
            <version>1.2.5</version>
            <scope>test</scope>
        </dependency>
        <!-- Spark core + YARN support, compiled against Scala 2.11 -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-yarn_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>
    </dependencies>
</project>
wordcount.scala
package org.wordcount

import org.apache.spark.{SparkConf, SparkContext}

/**
 * Minimal Spark word count over a hard-coded in-memory list.
 *
 * Runs locally on all available cores (`local[*]`) and prints each
 * (word, count) pair to stdout.
 */
object wordcount {
  def main(args: Array[String]): Unit = {
    // NOTE(review): app name keeps the original "worldcount" spelling; it is
    // only a display label, so left unchanged.
    val conf = new SparkConf().setMaster("local[*]").setAppName("sparkcore_worldcount")
    val sc = new SparkContext(conf)
    // Sample input distributed as an RDD; flatMap splits each element on spaces.
    val textfile = sc.makeRDD(List("yang", "zhang", "zhang", "yang", "li", "gao"))
    val words    = textfile.flatMap(_.split(" "))
    val pairs    = words.map((_, 1))          // (word, 1) per occurrence
    val result   = pairs.reduceByKey(_ + _)   // sum the 1s per distinct word
    result.collect().foreach(println)
    sc.stop() // release Spark resources before exiting
  }
}
hadoop/bin
# Create the test_data directory in HDFS (-p creates parents as needed)
hdfs dfs -mkdir -p /test_data
# Create the /input directory in HDFS
hadoop fs -mkdir /input
# List the files under /test_data in HDFS
hdfs dfs -ls /test_data
# Print the part-00000 / part-00001 result files
hdfs dfs -cat /output1/part-00000
hdfs dfs -cat /output2/part-00001
# Upload local files from the container into HDFS
hdfs dfs -put /home/test_data/untitled1-1.0-SNAPSHOT.jar /test_data
hdfs dfs -put /home/test_data/WorldCount.txt /test_data/input
hdfs dfs -put /home/test_data/WorldCount.txt /input
hdfs dfs -put /home/wc6_gpb-1.0-SNAPSHOT.jar /input
spark/bin
spark-submit --class org.wangyi.WordsCount.WordsCount --master yarn --deploy-mode client /home/spark01-1.0-SNAPSHOT.jar hdfs://hbase-master.hadoop-docker:54310/input/WorldCount.txt hdfs://hbase-master.hadoop-docker:54310/output2/
spark-submit --class org.example.wc61_gpb --master yarn --deploy-mode client /home/wc6_gpb-1.0-SNAPSHOT.jar hdfs://hbase-master.hadoop-docker:54310/input/WorldCount.txt hdfs://hbase-master.hadoop-docker:54310/output4/
# Generic wrapper (test.sh): $1 = main class, $2 = application jar, $3 = input path, $4 = output path.
# Arguments are quoted so paths containing spaces are passed through intact.
spark-submit --class "$1" --master yarn --deploy-mode client "$2" "$3" "$4"
bash test.sh org.example.wc61_gpb /home/wc6_gpb-1.0-SNAPSHOT.jar hdfs://hbase-master.hadoop-docker:54310/input/WorldCount.txt hdfs://hbase-master.hadoop-docker:54310/output50/
WordCount.jar
# Run the homework2 job on YARN against the MovieLens 100k ratings file
spark-submit --class org.example.homework2 --master yarn --deploy-mode client /home/gpb.jar hdfs://hbase-master.hadoop-docker:54310/a/ml-100k/u.data hdfs://hbase-master.hadoop-docker:54310/dzy3/