pom.xml
<repositories>
    <repository>
        <id>aliyun</id>
        <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
    </repository>
    <repository>
        <id>cloudera</id>
        <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
    </repository>
    <repository>
        <id>jboss</id>
        <url>http://repository.jboss.com/nexus/content/groups/public</url>
    </repository>
</repositories>
<properties>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <encoding>UTF-8</encoding>
    <scala.version>2.11.8</scala.version>
    <scala.compat.version>2.11</scala.compat.version>
    <hadoop.version>2.7.4</hadoop.version>
    <spark.version>2.2.0</spark.version>
</properties>
<dependencies>
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-library</artifactId>
        <version>${scala.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-hive_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-hive-thriftserver_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql-kafka-0-10_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <!-- Use the declared property (2.7.4) instead of repeating the literal. -->
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-client</artifactId>
        <version>1.3.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-server</artifactId>
        <version>1.3.1</version>
    </dependency>
    <dependency>
        <groupId>com.typesafe</groupId>
        <artifactId>config</artifactId>
        <version>1.3.3</version>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.38</version>
    </dependency>
</dependencies>
<build>
    <sourceDirectory>src/main/scala</sourceDirectory>
    <testSourceDirectory>src/test/scala</testSourceDirectory>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.5.1</version>
        </plugin>
        <plugin>
            <groupId>net.alchim31.maven</groupId>
            <artifactId>scala-maven-plugin</artifactId>
            <version>3.2.2</version>
            <executions>
                <execution>
                    <goals>
                        <goal>compile</goal>
                        <goal>testCompile</goal>
                    </goals>
                    <configuration>
                        <args>
                            <arg>-dependencyfile</arg>
                            <arg>${project.build.directory}/.scala_dependencies</arg>
                        </args>
                    </configuration>
                </execution>
            </executions>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-surefire-plugin</artifactId>
            <version>2.18.1</version>
            <configuration>
                <useFile>false</useFile>
                <disableXmlReport>true</disableXmlReport>
                <includes>
                    <include>**/*Test.*</include>
                    <include>**/*Suite.*</include>
                </includes>
            </configuration>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>2.3</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                    <configuration>
                        <filters>
                            <filter>
                                <artifact>*:*</artifact>
                                <excludes>
                                    <exclude>META-INF/*.SF</exclude>
                                    <exclude>META-INF/*.DSA</exclude>
                                    <exclude>META-INF/*.RSA</exclude>
                                </excludes>
                            </filter>
                        </filters>
                        <transformers>
                            <!-- The implementation value must have no leading/trailing
                                 whitespace or the transformer class will not be found. -->
                            <transformer
                                implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                <!-- TODO: set the fully-qualified main class of the jar,
                                     e.g. cn.itcast.sparkhello.WordCount -->
                                <mainClass></mainClass>
                            </transformer>
                        </transformers>
                    </configuration>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
本地运行（Run locally, e.g. inside the IDE）
package cn. itcast. sparkhello
import org. apache. spark. rdd. RDD
import org. apache. spark. { SparkConf, SparkContext}
object WordCount {
def main ( args: Array[ String] ) : Unit = {
val config = new SparkConf ( ) . setAppName ( "wc" ) . setMaster ( "local[*]" )
val sc = new SparkContext ( config)
sc. setLogLevel ( "WARN" )
val fileRDD: RDD[ String] = sc. textFile ( "D:\\授课\\190429\\资料\\data\\words.txt" )
val wordRDD: RDD[ String] = fileRDD. flatMap ( _. split ( " " ) )
val wordAndOneRDD: RDD[ ( String, Int) ] = wordRDD. map ( ( _, 1 ) )
val wordAndCount: RDD[ ( String, Int) ] = wordAndOneRDD. reduceByKey ( _+ _)
val result: Array[ ( String, Int) ] = wordAndCount. collect ( )
result. foreach ( println)
}
}
集群运行（Run on a cluster）
package cn. itcast. sparkhello
import org. apache. spark. rdd. RDD
import org. apache. spark. { SparkConf, SparkContext}
object WordCount {
def main ( args: Array[ String] ) : Unit = {
val config = new SparkConf ( ) . setAppName ( "wc" )
val sc = new SparkContext ( config)
sc. setLogLevel ( "WARN" )
val fileRDD: RDD[ String] = sc. textFile ( args ( 0 ) )
val wordRDD: RDD[ String] = fileRDD. flatMap ( _. split ( " " ) )
val wordAndOneRDD: RDD[ ( String, Int) ] = wordRDD. map ( ( _, 1 ) )
val wordAndCount: RDD[ ( String, Int) ] = wordAndOneRDD. reduceByKey ( _+ _)
wordAndCount. saveAsTextFile ( args ( 1 ) )
}
}
打包（Package the jar with `mvn package`）
上传（Upload the jar to the cluster, e.g. /root/wc.jar）
执行命令提交到Spark-HA集群（Submit to the Spark HA standalone cluster with the command below）
/export/servers/spark-2.2.0-bin-2.6.0-cdh5.14.0/bin/spark-submit \
--class cn.itcast.sparkhello.WordCount \
--master spark://node01:7077,node02:7077 \
--executor-memory 1g \
--total-executor-cores 2 \
/root/wc.jar \
hdfs://node01:8020/aa.txt \
hdfs://node01:8020/cc
执行命令提交到YARN集群（Submit to the YARN cluster with the command below）
/export/servers/spark-2.2.0-bin-2.6.0-cdh5.14.0/bin/spark-submit \
--class cn.itcast.sparkhello.WordCount \
--master yarn \
--deploy-mode cluster \
--driver-memory 1g \
--executor-memory 1g \
--executor-cores 2 \
--queue default \
/root/wc.jar \
hdfs://node01:8020/wordcount/input/words.txt \
hdfs://node01:8020/wordcount/output5
Java8版[了解]（Java 8 version — for reference only）
import org. apache. spark. SparkConf;
import org. apache. spark. api. java. JavaPairRDD;
import org. apache. spark. api. java. JavaRDD;
import org. apache. spark. api. java. JavaSparkContext;
import scala. Tuple2;
import java. util. Arrays;
public class WordCount_Java {
public static void main ( String[ ] args) {
SparkConf conf = new SparkConf ( ) . setAppName ( "wc" ) . setMaster ( "local[*]" ) ;
JavaSparkContext jsc = new JavaSparkContext ( conf) ;
JavaRDD< String> fileRDD = jsc. textFile ( "D:\\授课\\190429\\资料\\data\\words.txt" ) ;
JavaRDD< String> wordRDD = fileRDD. flatMap ( s -> Arrays. asList ( s. split ( " " ) ) . iterator ( ) ) ;
JavaPairRDD< String, Integer> wordAndOne = wordRDD. mapToPair ( w -> new Tuple2< > ( w, 1 ) ) ;
JavaPairRDD< String, Integer> wordAndCount = wordAndOne. reduceByKey ( ( a, b) -> a + b) ;
wordAndCount. collect ( ) . forEach ( System. out: : println) ;
}
}
public class Test {
public static void main ( String[ ] args) {
new Thread (
new Runnable ( ) {
@Override
public void run ( ) {
System. out. println ( "java8" ) ;
}
}
) . start ( ) ;
new Thread (
( ) -> System. out. println ( "java8" )
) . start ( ) ;
}
}