Maven configuration:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>groupId</groupId>
    <artifactId>SparkScalaTest</artifactId>
    <!-- packaging must be jar (not pom), or Maven will never compile and package the sources -->
    <packaging>jar</packaging>
    <version>1.0-SNAPSHOT</version>
    <url>http://maven.apache.org</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <scala.version>2.11.8</scala.version>
        <java.version>1.8</java.version>
        <spark.version>2.1.0</spark.version>
        <hadoop.version>2.7.3</hadoop.version>
    </properties>

    <dependencies>
        <!-- Spark core and network modules -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-network-common_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-network-shuffle_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <!-- Spark SQL and the Structured Streaming Kafka source -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql-kafka-0-10_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <!-- JDBC driver for writing results to MySQL -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>6.0.2</version>
        </dependency>
        <!-- Hive integration -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-metastore</artifactId>
            <version>2.1.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-service</artifactId>
            <version>2.1.0</version>
        </dependency>
        <!-- Spark Streaming and the Kafka 0.10 integration -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <!-- HBase client -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.2.4</version>
        </dependency>
    </dependencies>
</project>
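
The POM above only declares dependencies; without a build section Maven will not compile the .scala sources. A minimal sketch of such a section using the scala-maven-plugin follows (the plugin version 3.2.2 is illustrative; any recent release should work). The <build> element goes inside <project>, after </dependencies>:
<build>
    <plugins>
        <plugin>
            <groupId>net.alchim31.maven</groupId>
            <artifactId>scala-maven-plugin</artifactId>
            <version>3.2.2</version>
            <executions>
                <execution>
                    <goals>
                        <!-- hook Scala compilation into the default Maven lifecycle -->
                        <goal>compile</goal>
                        <goal>testCompile</goal>
                    </goals>
                </execution>
            </executions>
            <configuration>
                <scalaVersion>${scala.version}</scalaVersion>
            </configuration>
        </plugin>
    </plugins>
</build>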
WordCount:
package com.wc

import org.apache.spark.{SparkConf, SparkContext}

/**
 * Created by tan on 2016/8/4.
 */
object WordCountScala {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WordCount").setMaster("local")
    val sc = new SparkContext(conf)
    // One-liner version of the whole job:
    // sc.textFile("spark.txt").flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_).sortBy(-_._2).foreach(println(_))

    // Read the file as an RDD of lines
    val lines = sc.textFile("spark.txt")
    // flatMap: split each line into words and flatten the results into a single RDD
    val words = lines.flatMap(_.split(" "))
    // val words = lines.flatMap(line => line.split(" "))
    // Map each word to a (word, 1) key-value tuple
    val spair = words.map((_,1))
    // val spair = words.map(word => (word,1))
    // Group by key (the word), sum the counts, then sort by count in descending order
    val results = spair.reduceByKey(_+_).sortBy(-_._2)
    // val results = spair.reduceByKey((x,y) => x+y).sortBy(x => -x._2)
    // val results = spair.reduceByKey(_+_).map(tuple => (tuple._2,tuple._1)).sortByKey(false).map(tuple => (tuple._2,tuple._1))

    // Print every (word, count) pair; in cluster mode prefer
    // results.collect().foreach(println) so the output lands on the driver
    results.foreach(println(_))

    // Count the number of distinct words (equivalent to words.distinct().count())
    val distinctResult = words.distinct().map(_ => 1).reduce(_+_)
    println("distinctResult: " + distinctResult)

    sc.stop()
  }
}
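
Since spark-sql_2.11 is already on the classpath, the same count can be expressed with the Dataset API instead of raw RDDs. A minimal sketch under the same assumptions (local master, spark.txt in the working directory; the object name WordCountSqlScala is made up for this example):

package com.wc

import org.apache.spark.sql.SparkSession

object WordCountSqlScala {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("WordCountSQL")
      .master("local")
      .getOrCreate()
    import spark.implicits._

    // spark.read.textFile yields a Dataset[String] with a single column named "value"
    val words = spark.read.textFile("spark.txt")
      .flatMap(_.split(" "))

    // Group by the word column, count each group, and sort by the generated "count" column
    val counts = words.groupBy("value").count()
      .orderBy($"count".desc)

    counts.show()
    spark.stop()
  }
}

groupBy("value") works because textFile produces a single column named value; count() appends a count column, which orderBy then sorts on in descending order.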