之前为了搭建scala开发spark的环境花了几天的时间,终于搞定了,具体可以参考:http://www.cnblogs.com/ljy2013/p/4964201.html 。下面就是用一个示例来测试自己的开发环境了,于是就只用了大数据比较经典的例子:WordCount。下面详细说明一下:
1、首先按照之前搭建的环境,创建maven工程来写scala的代码。工程目录如下:
2、编写代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
|
package
com.yiban.datacenter.Spark_demo
import
org.apache.spark.SparkContext
import
org.apache.spark.SparkConf
import
org.apache.hadoop.conf.Configuration
import
org.apache.hadoop.fs.FileSystem
/**
* @author ${user.name}
*/
/**
 * Minimal Spark WordCount driver used to verify the Scala/Spark
 * development environment. Reads text from an HDFS input path, counts
 * word occurrences, and writes the result back to HDFS.
 */
object App {

  /** Concatenates all strings in `x` into a single string (no separator). */
  def foo(x: Array[String]): String = x.mkString

  def main(args: Array[String]): Unit = {
    // Explicit Hadoop configuration: the original author notes that
    // without disabling the HDFS FileSystem cache, local-mode runs fail.
    val hadoopConf = new Configuration()
    hadoopConf.setBoolean("fs.hdfs.impl.disable.cache", true)
    // Kept for its side effect of initializing the FileSystem with the
    // configuration above (see the author's note) — the value itself is unused.
    val fileSystem = FileSystem.get(hadoopConf)

    // Spark configuration. Hard-coding the master here matches the original
    // behaviour; normally the master is supplied via spark-submit --master.
    val conf = new SparkConf().setAppName("wordcount").setMaster("yarn-cluster")
    val sc = new SparkContext(conf)

    // WordCount pipeline. saveAsTextFile returns Unit, so (unlike the
    // original) the result is deliberately not bound to a val.
    sc.textFile("/user/liujiyu/input", 1)
      .flatMap(_.split(" "))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
      .saveAsTextFile("/user/liujiyu/sparkwordcountoutput")

    // Small parallelize demo: distribute a local array and write it out.
    // (The original also declared an unused `data2` Seq; removed.)
    val data = Array(1, 2, 3, 4, 5)
    val distData = sc.parallelize(data)
    distData.saveAsTextFile("/user/liujiyu/spark-demo")
  }
}
|
3、pom.xml文件内容如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
|
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.yiban.datacenter</groupId>
  <artifactId>Spark-demo</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <name>${project.artifactId}</name>
  <description>My wonderful scala app</description>
  <inceptionYear>2015</inceptionYear>
  <licenses>
    <license>
      <name>My License</name>
      <url>http://....</url>
      <distribution>repo</distribution>
    </license>
  </licenses>

  <properties>
    <maven.compiler.source>1.6</maven.compiler.source>
    <maven.compiler.target>1.6</maven.compiler.target>
    <encoding>UTF-8</encoding>
    <scala.version>2.10.5</scala.version>
    <scala.compat.version>2.10</scala.compat.version>
  </properties>

  <!-- Cloudera repository is required for the CDH-flavoured hadoop-client. -->
  <repositories>
    <repository>
      <id>cloudera-repo-releases</id>
      <url>https://repository.cloudera.com/artifactory/repo/</url>
    </repository>
  </repositories>

  <dependencies>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <!-- NOTE(review): the original artifactId contained a stray leading
           space (" spark-core_2.10"), which breaks dependency resolution. -->
      <artifactId>spark-core_2.10</artifactId>
      <version>1.5.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <!-- NOTE(review): stray leading space removed here as well. -->
      <artifactId>hadoop-client</artifactId>
      <version>2.6.0-cdh5.4.4</version>
    </dependency>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>

    <!-- Test -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.specs2</groupId>
      <artifactId>specs2-core_${scala.compat.version}</artifactId>
      <version>2.4.16</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.scalatest</groupId>
      <artifactId>scalatest_${scala.compat.version}</artifactId>
      <version>2.2.4</version>
      <scope>test</scope>
    </dependency>
  </dependencies>

  <build>
    <sourceDirectory>src/main/scala</sourceDirectory>
    <testSourceDirectory>src/test/scala</testSourceDirectory>
    <plugins>
      <plugin>
        <!-- see http://davidb.github.com/scala-maven-plugin -->
        <groupId>net.alchim31.maven</groupId>
        <artifactId>scala-maven-plugin</artifactId>
        <version>3.2.0</version>
        <executions>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
            <configuration>
              <args>
                <arg>-make:transitive</arg>
                <arg>-dependencyfile</arg>
                <arg>${project.build.directory}/.scala_dependencies</arg>
              </args>
            </configuration>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-surefire-plugin</artifactId>
        <version>2.18.1</version>
        <configuration>
          <useFile>false</useFile>
          <disableXmlReport>true</disableXmlReport>
          <!-- If you have classpath issue like NoDefClassError,... -->
          <!-- <useManifestOnlyJar>false</useManifestOnlyJar> -->
          <includes>
            <include>**/*Test.*</include>
            <include>**/*Suite.*</include>
          </includes>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
|
4、执行maven clean package 对工程进行打包。
5、将对应打包好的文件放到集群上去运行
执行如下命令进行运行:
spark-submit --class "com.yiban.datacenter.Spark_demo.App" --master yarn-cluster Spark-demo-0.0.1-SNAPSHOT.jar
运行结束,会在对应路径产生结果,查看hdfs对应路径结果即可。