Standalone application file: dhlTest.scala
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf

object dhlTest {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("BeyondNothing_dhl")
    val sc = new SparkContext(conf)
    // Load the input data
    val DHLtwo = sc.textFile("file:///usr/local/spark/dhlTest")
    DHLtwo.filter(_.trim().length > 0)  // drop blank lines; trim() strips leading/trailing whitespace
      .map(line => (line.trim, ""))     // key each line by its trimmed text, with an empty placeholder value
      .groupByKey()                     // collapse duplicate keys, deduplicating the lines
      .sortByKey()                      // sort pairs by the natural ordering of the keys
      .keys.collect().foreach(println)  // print the deduplicated, sorted lines
    sc.stop()
  }
}
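To illustrate what the job computes, here is a small hypothetical example; the actual data files are not shown in this write-up, and the file names and contents below are assumptions chosen to match the grep "2017" filter in step 5. Suppose the input directory holds two files:

A.txt:
20170101 x
20170102 y
20170103 x

B.txt:
20170101 y
20170102 y
20170103 x

The program merges both files, drops the duplicate lines, sorts them, and prints:

20170101 x
20170101 y
20170102 y
20170103 x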
Implementation steps
1. Create the directory structure
mkdir ./dhlTest                    # create the application's root directory
mkdir -p ./dhlTest/src/main/scala  # create the required source folder structure
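For reference, once steps 2 and 3 below are done, the directory should follow sbt's standard layout:

dhlTest/
├── simple.sbt
└── src/
    └── main/
        └── scala/
            └── dhlTest.scala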
2. Add the build configuration
Create a file named simple.sbt in the dhlTest directory and add the following:
name := "dhl_BeyondNothing"
version := "1.0"
scalaVersion := "2.11.12"
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.4.0"
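A note on this configuration: the %% operator makes sbt append the Scala binary version to the artifact name, so the dependency resolves to spark-core_2.11, which is why scalaVersion is pinned to a 2.11.x release matching Spark 2.4.0's Scala 2.11 build. The packaged jar's name is likewise derived from name (lowercased by sbt), the Scala binary version, and version, so sbt package should produce approximately:

~/dhlTest/target/scala-2.11/dhl_beyondnothing_2.11-1.0.jar

If in doubt, check the exact filename with ls ~/dhlTest/target/scala-2.11/ after packaging, and use that path in step 5.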
3. Add the source file (dhlTest.scala)
Place the code file in the src/main/scala directory.
4. Create the input data file(s) under /usr/local/spark/dhlTest, the path the program passes to textFile; Spark reads every file in that directory (see the sketch below).
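A minimal sketch of preparing the input, reusing the hypothetical A.txt and B.txt from the example above (adapt the file names and contents to your actual data; you may need write permission on /usr/local/spark):

mkdir -p /usr/local/spark/dhlTest
cat > /usr/local/spark/dhlTest/A.txt <<'EOF'
20170101 x
20170102 y
20170103 x
EOF
cat > /usr/local/spark/dhlTest/B.txt <<'EOF'
20170101 y
20170102 y
20170103 x
EOF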
5. Package and run
Compile and package the application:
/usr/local/sbt/sbt package
Run it with spark-submit:
/usr/local/spark/bin/spark-submit --class "dhlTest" ~/dhlTest/target/scala-2.11/dhl_beyondnothing_2.11-1.0.jar 2>&1 | grep "2017"
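A note on the pipeline at the end of the command: Spark writes its log output to stderr, so 2>&1 merges it into stdout and grep "2017" keeps only the printed result records. This filter works on the assumption (as in the hypothetical data above) that every record begins with a 2017 date; drop the grep to see the full log, for example when the job fails.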