一、为什么需要使用spark?
spark已经成为数据处理和算法建模的行业标准,主要原因在于spark可以很好地与java相结合,由于行业内存在大量java工程师,他们可以迅速地转为数据分析工程师和算法工程师。
spark中对数据的抽象是RDD,即弹性分布式数据集,文本文件、关系型数据库、非关系型数据库都可以被抽象成RDD。我们需要分析的数据确实来自各种格式的文件以及数据库,这种抽象非常适应我们当前的需求。
spark中的操作以函数式编程思想为基础,通过RDD1 --> RDD2这种流式处理方式处理数据,特别适用于数据的分析和处理。相比于Hadoop中的MapReduce,spark更像是面向对象,MapReduce更像是面向过程。RDD抽象程度更高,使用起来更加方便。
二、spark生态系统
分布式系统需要解决三个问题:存储、计算、管理。hadoop作为成熟的分布式系统,在存储上使用HDFS、计算使用MapReduce、管理包括资源管理yarn、节点管理zookeeper等。基于HDFS分布式文件系统我们一般使用HBase作为数据库,基于MapReduce分布式计算模型我们一般会使用Hive作为计算框架。
Hive的缺点:
1、计算速度太慢,仅仅是HelloWorld跑起来就很慢
2、使用SQL语法操作数据,处理细节上不够灵活,使用UDF也不太方便
3、Hive能够处理的数据是分区表,需要进行数据格式转换处理
spark与Hive核心功能重合,作为数据计算框架spark功能更强大。
1、spark-mllib(机器学习部分)
2、spark-streaming(微批处理框架)
三、wordCount
依赖管理
<!-- spark -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.3.0</version>
<scope>provided</scope>
</dependency>
代码:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;
/**
 * Word count: reads a text file, splits each line on single spaces and prints
 * every distinct word with its occurrence count.
 *
 * @author xuanchi.lyf
 */
public class WordCount {

    /** Pre-compiled delimiter; compiled once instead of per line. */
    private static final Pattern SPACE = Pattern.compile(" ");

    public static void main(String[] args) {
        if (args.length < 1) {
            System.err.println("Usage: JavaWordCount <file>");
            System.exit(1);
        }
        // NOTE(review): setMaster("local") overrides any --master flag given to
        // spark-submit; drop it when submitting this jar to a real cluster.
        SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("JavaWordCount");
        JavaSparkContext ctx = new JavaSparkContext(sparkConf);
        JavaRDD<String> lines = ctx.textFile(args[0], 1);
        // Split each line into words (lambdas instead of anonymous inner classes,
        // matching the Java 8 style used elsewhere in this project).
        JavaRDD<String> words = lines.flatMap(
                (FlatMapFunction<String, String>) s -> Arrays.asList(SPACE.split(s)).iterator());
        // Map each word to (word, 1), then sum the counts per word.
        JavaPairRDD<String, Integer> ones = words.mapToPair(
                (PairFunction<String, String, Integer>) s -> new Tuple2<>(s, 1));
        JavaPairRDD<String, Integer> counts = ones.reduceByKey(
                (Function2<Integer, Integer, Integer>) Integer::sum);
        List<Tuple2<String, Integer>> output = counts.collect();
        for (Tuple2<?, ?> tuple : output) {
            System.out.println(tuple._1() + ": " + tuple._2());
        }
        ctx.stop();
    }
}
打包:
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>3.1.0</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
</executions>
</plugin>
这里注意一点,上面的例子是打成jar包的方式提交到spark集群上,所以需要使用shade方式将依赖打进去,由于jar包运行在spark平台上,所以一般会将spark相关的包设置为provided,这样打包生成的文件就会小很多。按照jar提交任务是我们在生产环境下普遍使用的方式,但是每家公司或许在形式上可能会有些不同,当然spark也提供http接口的方式提交任务,但是从本质上讲大同小异。
本地运行:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.Serializable;
/**
 * Spark context lifecycle manager: owns one local-mode JavaSparkContext and
 * lazily creates a SQLContext and a JavaStreamingContext on top of it.
 *
 * @author xuanchi.lyf
 */
public class SparkManger implements Serializable {

    private static final Logger logger = LoggerFactory.getLogger(SparkManger.class);

    // transient: Spark contexts are not serializable and must never be shipped to executors.
    private transient JavaSparkContext javaSparkContext;
    private transient SQLContext sqlContext;
    private transient JavaStreamingContext streamingContext;

    public SparkManger() {
        this.javaSparkContext = new JavaSparkContext(buildConf());
    }

    /** Shared local-mode configuration for the single SparkContext. */
    private static SparkConf buildConf() {
        SparkConf conf = new SparkConf();
        conf.setMaster("local[*]");
        conf.setAppName("spark-mllib-study");
        conf.set("spark.memory.offHeap.enabled", "true");
        conf.set("spark.memory.offHeap.size", "5g");
        conf.set("spark.locality.wait.node", "4");
        conf.set("spark.locality.wait.process", "6");
        conf.set("spark.streaming.stopGracefullyOnShutdown", "true");
        conf.set("spark.sql.catalogImplementation", "in-memory");
        return conf;
    }

    public JavaSparkContext getJavaSparkContext() {
        return javaSparkContext;
    }

    /** Lazily creates the SQLContext; synchronized so concurrent callers share one instance. */
    public synchronized SQLContext getSqlContext() {
        if (sqlContext == null) {
            sqlContext = new SQLContext(javaSparkContext);
        }
        return sqlContext;
    }

    /**
     * Lazily creates a streaming context with a 1-second batch interval.
     * Fixed: the original built a second SparkConf here that was never used —
     * the streaming context always reuses the existing JavaSparkContext.
     */
    public synchronized JavaStreamingContext getStreamingContext() {
        if (streamingContext == null) {
            this.streamingContext = new JavaStreamingContext(javaSparkContext, Durations.seconds(1));
        }
        return streamingContext;
    }

    /**
     * Releases resources: stops the streaming context if one was created
     * (previously leaked), clears the SQL cache and closes the SparkContext.
     */
    public void close() {
        if (streamingContext != null) {
            // stop(false): the shared SparkContext is closed explicitly below.
            streamingContext.stop(false);
        }
        if (sqlContext != null) {
            sqlContext.clearCache();
        }
        javaSparkContext.close();
        logger.info("spark context closed.");
    }
}
import com.google.common.collect.Lists;
import com.lyf.spark.study.common.SparkManger;
import com.lyf.spark.study.common.util.CollectionUtil;
import org.apache.parquet.Strings;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import scala.Tuple2;
import java.util.List;
import java.util.Map;
import java.util.Objects;
/**
 * wordCount — counts word frequencies in a local text file and prints the map.
 *
 * @author xuanchi.lyf
 */
public class WordCount {

    private static final String FILEPATH = "word-count.txt";

    /**
     * Counts word frequency: splits each non-blank line on whitespace, maps
     * every word to (word, 1) and sums the counts per word.
     */
    public static void main(String[] args) {
        SparkManger sparkManger = new SparkManger();
        JavaSparkContext javaSparkContext = sparkManger.getJavaSparkContext();
        Map<String, Integer> map = javaSparkContext.textFile(FILEPATH)
            .flatMapToPair((PairFlatMapFunction<String, String, Integer>) line -> {
                List<Tuple2<String, Integer>> pairs = Lists.newArrayList();
                if (Strings.isNullOrEmpty(line)) {
                    return pairs.iterator();
                }
                for (String word : line.split("\\s+")) {
                    pairs.add(new Tuple2<>(word.trim(), 1));
                }
                return pairs.iterator();
            })
            // Fixed: the original added .filter(Objects::nonNull) here, but the
            // flatMap above never emits a null tuple — the filter was dead code.
            .reduceByKey((Function2<Integer, Integer, Integer>) Integer::sum)
            .collectAsMap();
        CollectionUtil.printMap(map);
        sparkManger.close();
    }
}
这种情况下可以本地直接运行,但是会抛出下面的异常:
Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/spark/SparkConf
at com.lyf.spark.study.common.SparkManger.<init>(SparkManger.java:31)
at com.lyf.spark.study.WordCount.main(WordCount.java:30)
Caused by: java.lang.ClassNotFoundException: org.apache.spark.SparkConf
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:335)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 2 more
原因很明确,修改起来也很容易,只需要将<scope>provided</scope>去掉就可以了。但是这样不够友好,本地运行加上一行注释,打成jar包之前去掉注释,确实不够优雅。如果使用IntelliJ IDEA的话,可以使用这一招解决这个问题,将“include dependencies with "Provided" scope”勾上,就能够解决这个问题:
四、RDD
在处理spark问题上,其实我们大都是在和RDD打交道。在处理RDD的基本套路:从一个RDD转换成另一种格式RDD,每一个RDD都是不可变的。这种变换在spark上称为算子,常用算子包括但不限于:
1、map
2、flatmap
3、reduce
4、reduceByKey
5、join
6、union
7、zip
五、Dataset
Dataset是数据集,Dataset这个概念应该是借用自R语言中的数据集。
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.3.0</version>
</dependency>
一般来说我们都是通过RDD的方式生成数据集:
// Build a Row RDD from the text file: columns 0-6 become a dense feature
// vector, column 7 the integer label; blank lines map to null and are filtered out.
JavaRDD<Row> javaRDD = sparkContext.textFile(FILE_PATH).map((Function<String, Row>) line -> {
if (Strings.isNullOrEmpty(line)) {
return null;
}
String[] splits = line.split("\\s+");
double[] features = new double[7];
for (int i = 0; i < features.length; i++) {
features[i] = Double.valueOf(splits[i]);
}
// NOTE(review): assumes every non-blank line has at least 8 whitespace-separated fields — confirm input format.
int label = Integer.valueOf(splits[7]);
return RowFactory.create(label, Vectors.dense(features));
}).filter((Function<Row, Boolean>) Objects::nonNull);
// Schema: non-nullable integer label plus a VectorUDT feature column.
StructType schema = new StructType(new StructField[]{
new StructField("label", DataTypes.IntegerType, Boolean.FALSE, Metadata.empty()),
new StructField("features", new VectorUDT(), Boolean.FALSE, Metadata.empty())
});
Dataset<Row> dataset = sqlContext.createDataFrame(javaRDD, schema);
// show(false): do not truncate the wide feature-vector column in the output.
dataset.show(Boolean.FALSE);
+-----+--------------------------------------------+
|label|features |
+-----+--------------------------------------------+
|1 |[15.26,14.84,0.871,5.763,3.312,2.221,5.22] |
|1 |[14.88,14.57,0.8811,5.554,3.333,1.018,4.956]|
|1 |[14.29,14.09,0.905,5.291,3.337,2.699,4.825] |
|1 |[13.84,13.94,0.8955,5.324,3.379,2.259,4.805]|
|1 |[16.14,14.99,0.9034,5.658,3.562,1.355,5.175]|
|1 |[14.38,14.21,0.8951,5.386,3.312,2.462,4.956]|
|1 |[14.69,14.49,0.8799,5.563,3.259,3.586,5.219]|
|1 |[14.11,14.1,0.8911,5.42,3.302,2.7,5.0] |
|1 |[16.63,15.46,0.8747,6.053,3.465,2.04,5.877] |
|1 |[16.44,15.25,0.888,5.884,3.505,1.969,5.533] |
|1 |[15.26,14.85,0.8696,5.714,3.242,4.543,5.314]|
|1 |[14.03,14.16,0.8796,5.438,3.201,1.717,5.001]|
|1 |[13.89,14.02,0.888,5.439,3.199,3.986,4.738] |
|1 |[13.78,14.06,0.8759,5.479,3.156,3.136,4.872]|
|1 |[13.74,14.05,0.8744,5.482,3.114,2.932,4.825]|
|1 |[14.59,14.28,0.8993,5.351,3.333,4.185,4.781]|
|1 |[13.99,13.83,0.9183,5.119,3.383,5.234,4.781]|
|1 |[15.69,14.75,0.9058,5.527,3.514,1.599,5.046]|
|1 |[14.7,14.21,0.9153,5.205,3.466,1.767,4.649] |
|1 |[12.72,13.57,0.8686,5.226,3.049,4.102,4.914]|
+-----+--------------------------------------------+
一个完整的例子:
import java.util.Objects;
/**
 * DataFrame demo — three equivalent ways to add a derived column:
 * a column expression, a SQL query, and an RDD round-trip.
 *
 * @author xuanchi.lyf
 */
public class DataFrameStudy {

    private static final String FILE_PATH = "data-frame.txt";

    private final SparkManger sparkManger;
    private final JavaSparkContext sparkContext;
    private final SQLContext sqlContext;

    private DataFrameStudy() {
        this.sparkManger = new SparkManger();
        this.sparkContext = sparkManger.getJavaSparkContext();
        this.sqlContext = sparkManger.getSqlContext();
    }

    /**
     * Builds a non-nullable IntegerType schema with columns col1..colN.
     * Replaces the two hand-written, duplicated schema definitions.
     */
    private static StructType buildSchema(int columns) {
        StructField[] fields = new StructField[columns];
        for (int i = 0; i < columns; i++) {
            fields[i] = new StructField("col" + (i + 1), DataTypes.IntegerType, Boolean.FALSE, Metadata.empty());
        }
        return new StructType(fields);
    }

    /**
     * Builds the base dataset: each non-blank line of FILE_PATH yields a row of
     * four whitespace-separated integers (col1..col4).
     */
    private Dataset<Row> buildDataset() {
        JavaRDD<Row> javaRDD = sparkContext.textFile(FILE_PATH).map((Function<String, Row>) line -> {
            if (Strings.isNullOrEmpty(line)) {
                return null; // blank lines are removed by the nonNull filter below
            }
            String[] splits = line.split("\\s+");
            return RowFactory.create(
                Integer.valueOf(splits[0]),
                Integer.valueOf(splits[1]),
                Integer.valueOf(splits[2]),
                Integer.valueOf(splits[3]));
        }).filter((Function<Row, Boolean>) Objects::nonNull);
        Dataset<Row> dataset = sqlContext.createDataFrame(javaRDD, buildSchema(4));
        dataset.show(Boolean.FALSE);
        return dataset;
    }

    private void work() {
        Dataset<Row> dataset = buildDataset();
        // 1) Column expression: derive col5 from an existing column.
        Dataset<Row> last1 = dataset.withColumn("col5", dataset.col("col1").plus(1));
        last1.show(Boolean.FALSE);
        // 2) SQL over a registered temp table.
        sqlContext.registerDataFrameAsTable(dataset, "temp");
        dataset.sqlContext().sql("select col1, col2, col3, col4, col1 + col2 + col3 + col4 as col5 from temp").show(Boolean.FALSE);
        // 3) Convert to an RDD, then back to a dataset with the extra column.
        JavaRDD<Row> javaRDD = dataset.toJavaRDD().map((Function<Row, Row>) row -> {
            int col1 = row.getInt(0);
            int col2 = row.getInt(1);
            int col3 = row.getInt(2);
            int col4 = row.getInt(3);
            return RowFactory.create(col1, col2, col3, col4, col1 + col2 + col3 + col4);
        });
        Dataset<Row> last2 = sqlContext.createDataFrame(javaRDD, buildSchema(5));
        last2.show(Boolean.FALSE);
    }

    public static void main(String[] args) {
        DataFrameStudy dataFrameStudy = new DataFrameStudy();
        try {
            dataFrameStudy.work();
        } finally {
            // Fixed: the original never closed the SparkManger (context leak).
            dataFrameStudy.sparkManger.close();
        }
    }
}
data-frame.txt文件格式:
1 2 3 4
2 2 3 4
4 5 6 7
8 8 8 8
1 2 3 4