package com.jdjr.city.demo
import org.apache.spark.sql.SparkSession
/**
* @Auther: hongwei * @Date: 2018/11/9 16:31 * @Description: SparkSession使用
*/
object Test4 {
def main(args: Array[String]): Unit = {
val spark = SparkSession
.builder()
.appName("ActionOperation")
.master("local")
.getOrCreate()
spark.sparkContext.setLogLevel("WARN")
import spark.implicits._
val employee = spark.read.json("D:\\jdWork\\code\\MySpark\\src\\main\\resources\\1.txt")
// collect:将分布式存储在集群上的分布式数据集(比如dataset)中的所有数据都获取到driver端来
employee.collect().foreach {
println(_)
}
// count:对dataset中的记录数进行统计个数的操作
println(employee.count())
// first:获取数据集中的第一条数据
println(employee.first())
// foreach:遍历数据集中的每一条数据,对数据进行操作,这个跟collect不同,collect是将数据获取到driver端进行操作
// foreach是将计算操作推到集群上去分布式执行
// foreach(println(_))这种,真正在集群中执行的时候,是没用的,因为输出的结果是在分布式的集群中的,我们是看不到的
employee.foreach {
println(_)
}
// reduce:对数据集中的所有数据进行归约的操作,多条变成一条
// 用reduce来实现数据集的个数的统计
//println(employee.map(employee => 1).reduce(_ + _))
// show,默认将dataset数据打印前20条
employee.show()
// take,从数据集中获取指定条数
employee.take(3).foreach {
println(_)
}
}
}
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.jdjr.city</groupId>
<artifactId>MySpark</artifactId>
<version>1.0-SNAPSHOT</version>
<repositories>
<repository>
<id>cloudera</id>
<url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
</repository>
</repositories>
<properties>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<encoding>UTF-8</encoding>
<scala.version>2.11.8</scala.version>
<spark.version>2.2.0</spark.version>
<hadoop.version>2.6.4</hadoop.version>
</properties>
<dependencies>
<!--<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>-->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.8</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.5</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>
<dependency>
<groupId>org.ansj</groupId>
<artifactId>ansj_seg</artifactId>
<version>5.0.4</version>
</dependency>
<dependency>
<groupId>com.geccocrawler</groupId>
<artifactId>gecco</artifactId>
<version>1.0.8</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
</dependencies>
<build>
<sourceDirectory>src/main/scala</sourceDirectory>
<testSourceDirectory>src/test/scala</testSourceDirectory>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.2</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
<configuration>
<args>
<!--scala 2.11不支持-->
<!-- <arg>-make:transitive</arg>-->
<arg>-dependencyfile</arg>
<arg>${project.build.directory}/.scala_dependencies</arg>
</args>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.4.3</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>com.jdjr.city.demo.SearchPoi2</mainClass>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
原文:https://blog.csdn.net/qq_22253209/article/details/83900319