It took me a few days to get a Scala development environment for Spark working; the details are at http://www.cnblogs.com/ljy2013/p/4964201.html . The next step was to verify the setup with an example, so I used the classic big-data example: WordCount. Here is a detailed walkthrough:

1. With the environment from the previous post in place, create a Maven project for the Scala code. The project layout is as follows:
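A typical layout for this kind of Maven Scala project (inferred from the package name and the pom.xml below, since the original screenshot is not reproducible here) looks roughly like this:

Spark-demo/
├── pom.xml
└── src/
    ├── main/
    │   └── scala/
    │       └── com/yiban/datacenter/Spark_demo/
    │           └── App.scala
    └── test/
        └── scala/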

2. Write the code

package com.yiban.datacenter.Spark_demo

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem

/**
 * @author ${user.name}
 */
object App {

  def foo(x: Array[String]) = x.foldLeft("")((a, b) => a + b)

  def main(args: Array[String]) {

    // Hadoop configuration; without this, local mode throws an error
    val hadoopconf = new Configuration()
    hadoopconf.setBoolean("fs.hdfs.impl.disable.cache", true)
    val fileSystem = FileSystem.get(hadoopconf)

    // Spark configuration: run on the YARN cluster
    val conf = new SparkConf().setAppName("wordcount").setMaster("yarn-cluster")

    val sc = new SparkContext(conf)

    // WordCount: split each line into words, pair each word with a count of 1,
    // sum the counts per word, and write the result to HDFS
    val wordcount = sc.textFile("/user/liujiyu/input", 1)
      .flatMap(_.split(" "))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
      .saveAsTextFile("/user/liujiyu/sparkwordcountoutput")

    // A small parallelize example: distribute a local collection and save it
    val data = Array(1, 2, 3, 4, 5)
    val data2 = Seq(1, 2, 3)
    val distData = sc.parallelize(data)

    distData.saveAsTextFile("/user/liujiyu/spark-demo")

  }

}
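Before submitting to YARN, the same logic can be smoke-tested in local mode. Here is a minimal sketch, assuming a hypothetical local input path /tmp/input (the object name AppLocal is also just for illustration; everything else mirrors the code above):

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object AppLocal {
  def main(args: Array[String]) {
    // local[*] runs Spark inside this JVM using all available cores, no YARN needed
    val conf = new SparkConf().setAppName("wordcount-local").setMaster("local[*]")
    val sc = new SparkContext(conf)

    sc.textFile("/tmp/input")      // hypothetical local input path
      .flatMap(_.split(" "))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
      .collect()                   // fine for small test data: results come back to the driver
      .foreach(println)

    sc.stop()
  }
}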

3. The pom.xml is as follows:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.yiban.datacenter</groupId>
  <artifactId>Spark-demo</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <name>${project.artifactId}</name>
  <description>My wonderful scala app</description>
  <inceptionYear>2015</inceptionYear>
  <licenses>
    <license>
      <name>My License</name>
      <url>http://....</url>
      <distribution>repo</distribution>
    </license>
  </licenses>

  <properties>
    <maven.compiler.source>1.6</maven.compiler.source>
    <maven.compiler.target>1.6</maven.compiler.target>
    <encoding>UTF-8</encoding>
    <scala.version>2.10.5</scala.version>
    <scala.compat.version>2.10</scala.compat.version>
  </properties>

  <repositories>
    <repository>
      <id>cloudera-repo-releases</id>
      <url>https://repository.cloudera.com/artifactory/repo/</url>
    </repository>
  </repositories>

  <dependencies>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.10</artifactId>
      <version>1.5.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.6.0-cdh5.4.4</version>
    </dependency>

    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>

    <!-- Test -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.specs2</groupId>
      <artifactId>specs2-core_${scala.compat.version}</artifactId>
      <version>2.4.16</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.scalatest</groupId>
      <artifactId>scalatest_${scala.compat.version}</artifactId>
      <version>2.2.4</version>
      <scope>test</scope>
    </dependency>
  </dependencies>

  <build>
    <sourceDirectory>src/main/scala</sourceDirectory>
    <testSourceDirectory>src/test/scala</testSourceDirectory>
    <plugins>
      <plugin>
        <!-- see http://davidb.github.com/scala-maven-plugin -->
        <groupId>net.alchim31.maven</groupId>
        <artifactId>scala-maven-plugin</artifactId>
        <version>3.2.0</version>
        <executions>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
            <configuration>
              <args>
                <arg>-make:transitive</arg>
                <arg>-dependencyfile</arg>
                <arg>${project.build.directory}/.scala_dependencies</arg>
              </args>
            </configuration>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-surefire-plugin</artifactId>
        <version>2.18.1</version>
        <configuration>
          <useFile>false</useFile>
          <disableXmlReport>true</disableXmlReport>
          <!-- If you have classpath issues like NoClassDefFoundError, ... -->
          <!-- useManifestOnlyJar>false</useManifestOnlyJar -->
          <includes>
            <include>**/*Test.*</include>
            <include>**/*Suite.*</include>
          </includes>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
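One caveat: the Scala binary version in the spark-core artifact (spark-core_2.10) must match scala.compat.version and the scala-library version (2.10.x here); a mismatch typically only shows up at runtime as NoSuchMethodError or similar linkage errors.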

4. Run maven clean package to build and package the project.
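From the project root:

mvn clean package

which leaves the jar at target/Spark-demo-0.0.1-SNAPSHOT.jar (artifactId plus version, following Maven's standard naming).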

5. Copy the packaged jar to the cluster and run it

Run it with the following command:

spark-submit --class "com.yiban.datacenter.Spark_demo.App" --master yarn-cluster Spark-demo-0.0.1-SNAPSHOT.jar

When the job finishes, the results are written to the paths given in the code; check those paths on HDFS to see the output.
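For example, using the output path from the WordCount code:

hadoop fs -ls /user/liujiyu/sparkwordcountoutput
hadoop fs -cat /user/liujiyu/sparkwordcountoutput/part-*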