最近开始学习Spark,在了解了Spark的相关知识后,编写了一个Spark的简单例子
对于Spark的基础概念,已有比较多的资料,可以参见以下链接
Spark的基础知识:https://www.cnblogs.com/qingyunzong/p/8886338.html
Spark的transformation和action算子:
https://blog.csdn.net/lin1270473045/article/details/80133363
https://www.cnblogs.com/qingyunzong/p/8922135.html
*注意:Spark 的 transformation 算子是惰性(lazy)求值的——在执行 action 操作之前,所有 transformation 都不会真正执行,只是记录其计算方式;当触发 action 操作时,才会一并执行之前记录的全部 transformation 操作。
实例一通过读取txt中的内容,并通过新建对象,将其由JavaRDD<String>转换为JavaRDD<Object>,对其进行数据库的相关查询操作,实例如下
import org.apache.spark.api.java.*;
import org.apache.spark.api.java.function.*;
import org.apache.spark.sql.*;
import static org.apache.spark.sql.functions.col;
/**
 * Example 1: reads two local text files into JavaRDDs, maps them to typed
 * bean RDDs, builds DataFrames, left-joins them on "ID", fills nulls with
 * defaults, derives a new column, and runs a SQL aggregation over a temp view.
 */
public class StudentTest {
    public static void main(String[] args) {
        // SparkSession is the unified entry point since Spark 2.0; it replaces
        // the older SparkContext/SQLContext pair for SQL work.
        SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();
        // Read each file as a JavaRDD<String>, one element per line.
        JavaRDD<String> strarr = spark.sparkContext().textFile("E:/study/spark/readme.txt", 1).toJavaRDD();
        JavaRDD<String> strarr2 = spark.sparkContext().textFile("E:/study/spark/readme2.txt", 1).toJavaRDD();
        // Convert raw lines to typed RDDs (JavaRDD<String> -> JavaRDD<bean>).
        JavaRDD<Student> stus1 = getJavaRDDStu(strarr);
        JavaRDD<SampleStudent> stus2 = getJavaRDDSam(strarr2);
        // Build Dataset<Row> (DataFrame) from the bean-typed RDDs via reflection.
        Dataset<Row> datasets = spark.createDataFrame(stus1, Student.class);
        Dataset<Row> datasets2 = spark.createDataFrame(stus2, SampleStudent.class);
        // Left-join on equal ID, then drop the duplicated right-side ID column.
        Dataset<Row> leftresult = datasets
                .join(datasets2, datasets.col("ID").equalTo(datasets2.col("ID")), "left")
                .drop(datasets2.col("ID"));
        leftresult.show();
        // Fill nulls introduced by the left join: "intersert", "sex", "grade".
        Dataset<Row> result = leftresult.na().fill("eat", new String[]{"intersert"});
        result = result.na().fill("man", new String[]{"sex"}).na().fill(60, new String[]{"grade"});
        result.show();
        // Add a column "exAge" whose values copy the existing "age" column.
        result = result.withColumn("exAge", col("age"));
        result.show();
        result.createOrReplaceTempView("table");
        // Run SQL directly on the session; constructing a SQLContext is
        // deprecated since Spark 2.0 in favor of SparkSession.sql().
        Dataset<Row> temview2 = spark.sql("SELECT ID, avg(grade) FROM table group by ID");
        temview2.show();
        spark.close();
    }

    /** Parses comma-separated lines "ID,name,age,classgrade" into Student beans. */
    public static JavaRDD<Student> getJavaRDDStu(JavaRDD<String> strarr) {
        return strarr.map(s -> {
            String[] attrs = s.split(",");
            Student stu = new Student();
            stu.setID(attrs[0]);
            stu.setName(attrs[1]);
            // parseInt avoids the needless boxing of Integer.valueOf for an int setter.
            stu.setAge(Integer.parseInt(attrs[2]));
            stu.setClassgrade(attrs[3]);
            return stu;
        });
    }

    /** Parses space-separated lines "ID sex intersert grade" into SampleStudent beans. */
    public static JavaRDD<SampleStudent> getJavaRDDSam(JavaRDD<String> strarr) {
        return strarr.map(s -> {
            String[] attrs = s.split(" ");
            SampleStudent stu1 = new SampleStudent();
            stu1.setID(attrs[0]);
            stu1.setSex(attrs[1]);
            stu1.setIntersert(attrs[2]);
            stu1.setGrade(Integer.parseInt(attrs[3]));
            return stu1;
        });
    }
}
//Student定义
/**
 * Plain Java bean holding one student record: ID, name, age and class grade.
 * Bean-style getters/setters are required so Spark's createDataFrame can map
 * the fields to DataFrame columns via reflection.
 */
public class Student {
    // Fields remain public for backward compatibility with existing callers.
    public String ID;
    public String name;
    public int age;
    public String classgrade;

    /** Renders the record as "ID:name:age:classgrade". */
    @Override
    public String toString() {
        // Concatenation already yields a String; no cast needed.
        return ID + ":" + name + ":" + age + ":" + classgrade;
    }

    public String getID() {
        return ID;
    }

    public String getName() {
        return name;
    }

    public int getAge() {
        return age;
    }

    public String getClassgrade() {
        return classgrade;
    }

    public void setID(String ID) {
        this.ID = ID;
    }

    public void setName(String name) {
        this.name = name;
    }

    public void setAge(int age) {
        this.age = age;
    }

    public void setClassgrade(String classgrade) {
        this.classgrade = classgrade;
    }
}
//SampleStudent定义
/**
 * Plain Java bean holding the supplementary student record joined against
 * Student by ID: sex, interest ("intersert") and grade. Bean accessors are
 * required for Spark's reflection-based createDataFrame.
 */
public class SampleStudent {
    private String ID;
    private String sex;
    private String intersert;
    private int grade;

    public String getID() {
        return ID;
    }

    public void setID(String ID) {
        this.ID = ID;
    }

    public String getSex() {
        return sex;
    }

    // Parameter renamed from the misleading "name" to match the field it sets.
    public void setSex(String sex) {
        this.sex = sex;
    }

    public String getIntersert() {
        return intersert;
    }

    public void setIntersert(String intersert) {
        this.intersert = intersert;
    }

    public int getGrade() {
        return grade;
    }

    public void setGrade(int grade) {
        this.grade = grade;
    }
}
readme.txt 和 readme2.txt 的内容如下:
实例二,通过Spark读取本地文件,统计其中各个单词出现的次数。实例如下:
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
import java.util.Arrays;
/**
 * Example 2: reads a local text file with Spark and counts the occurrences
 * of each space-separated word (the classic word-count example).
 */
public class WordTest {
    public static void main(String[] args) {
        // Unified Spark entry point, running locally on all available cores.
        SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();
        try {
            long startTime = System.currentTimeMillis();
            // Read the file as a JavaRDD<String>, one element per line.
            JavaRDD<String> wordSets = spark.sparkContext()
                    .textFile("E:/study/spark/Definitions.txt", 1).toJavaRDD();
            // flatMap flattens each line into its words:
            // String -> String[] -> individual String elements.
            JavaRDD<String> wordCounts = wordSets.flatMap(row -> Arrays.asList(row.split(" ")).iterator());
            // mapToPair turns each word into a (word, 1) tuple.
            JavaPairRDD<String, Integer> wordPairs = wordCounts.mapToPair(row -> new Tuple2<>(row, 1));
            // reduceByKey sums the counts for identical words.
            JavaPairRDD<String, Integer> wordByKey = wordPairs.reduceByKey(Integer::sum);
            System.out.println(System.currentTimeMillis() - startTime);
            // collect() is an action: it triggers the lazy pipeline above and
            // brings the distributed result back to the driver.
            System.out.println(wordByKey.collect());
        } finally {
            // Always release the session (the original leaked it on exit).
            spark.close();
        }
    }
}