1、maven引入,pom.xml配置
#版本定义 <properties> <java.version>1.8</java.version> <scala.version>2.11.12</scala.version> <spark.version>2.2.0</spark.version> <scala.tools.version>2.11</scala.tools.version> </properties>
#maven引入 <!--scala所需jar包 https://mvnrepository.com/artifact/org.scala-lang/scala-library --> <dependency> <groupId>org.scala-lang</groupId> <artifactId>scala-library</artifactId> <version>${scala.version}</version> </dependency>
<!-- spark所需jar包--> <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-core_${scala.tools.version}</artifactId> <version>${spark.version}</version> </dependency> <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-streaming_${scala.tools.version}</artifactId> <version>${spark.version}</version> </dependency> <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-sql_${scala.tools.version}</artifactId> <version>${spark.version}</version> </dependency> <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-streaming-kafka-0-10_${scala.tools.version}</artifactId> <version>${spark.version}</version> </dependency>
|
2.idea sdk配置
file——project structure ——libraries 配置scala SDK版本,版本需要与pom文件中定义的版本一致
否则会报错（SDK 版本与 pom 中定义的 scala 版本不一致会导致编译错误）
3.编写一个简单的单词统计案例测试开发环境是否正常
object WordCountEmpl {

  /**
   * Minimal word-count job used to verify the local Spark development environment.
   *
   * Reads a text file, splits each line on single spaces, counts word occurrences,
   * and prints the (word, count) pairs sorted by count in descending order.
   *
   * @param args optional: args(0) is the input file path; when absent the original
   *             hard-coded demo path is used (backward compatible).
   */
  def main(args: Array[String]): Unit = {
    // Local imports keep this snippet self-contained for the tutorial.
    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.rdd.RDD

    // 1. Spark configuration: local test mode with 2 worker threads.
    val sparkConf: SparkConf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("wordCountEmpl")

    // 2. Entry point for the Spark cluster connection.
    val sc = new SparkContext(sparkConf)

    try {
      // Input path is now parameterized; the default preserves the original demo path.
      val inputPath: String = args.headOption.getOrElse(
        "C:\\Users\\Administrator\\Desktop\\环保项目\\部署项目\\bigdata\\wry_data1.txt")

      // 3. textFile() builds an RDD where each element is one line of the file.
      val lines = sc.textFile(inputPath)

      // Split lines into individual words.
      val words: RDD[String] = lines.flatMap(_.split(" "))

      // Pair each word with 1 so occurrences can be summed per key.
      val pairs: RDD[(String, Int)] = words.map((_, 1))

      // Aggregate counts per word.
      val counts: RDD[(String, Int)] = pairs.reduceByKey(_ + _)

      // Sort by count, descending.
      val sorted: RDD[(String, Int)] = counts.sortBy(_._2, ascending = false)

      // collect() pulls results to the driver; foreach(println) is plain Scala iteration.
      sorted.collect().foreach(println)
    } finally {
      // Release the SparkContext even if the job above throws.
      sc.stop()
    }
  }
}