初识Spark

最近开始学习Spark,在了解了Spark的相关知识后,编写了几个Spark的简单例子。

对于Spark的基础概念,已有比较多的资料,可以参见以下链接

Spark的基础知识:https://www.cnblogs.com/qingyunzong/p/8886338.html

Spark的transformation和action算子:

https://blog.csdn.net/lin1270473045/article/details/80133363

https://www.cnblogs.com/qingyunzong/p/8922135.html

*注意:Spark的transformation算子是惰性(lazy)求值的。在执行action操作之前,所有transformation操作都不会真正执行,Spark只是记录下它们的执行方式;当执行action操作时,才会一并执行之前记录的所有transformation操作。

 

实例一:读取txt文件中的内容,通过新建对象将JavaRDD<String>转换为JavaRDD<Student>和JavaRDD<SampleStudent>,再由它们构建Dataset<Row>,并对其进行类似数据库的连接、查询等操作。实例如下:


import org.apache.spark.api.java.*;
import org.apache.spark.api.java.function.*;
import org.apache.spark.sql.*;
import static org.apache.spark.sql.functions.col;

/**
 * Demo that loads two text files, converts each line into a bean
 * ({@code Student} / {@code SampleStudent}), builds DataFrames from the beans,
 * left-joins them on {@code ID}, fills nulls, adds a derived column and finally
 * runs an aggregate SQL query over the result.
 */
public class StudentTest{
    /** Number of comma-separated fields expected in a Student line. */
    private static final int STUDENT_FIELD_COUNT = 4;
    /** Number of space-separated fields expected in a SampleStudent line. */
    private static final int SAMPLE_FIELD_COUNT = 4;

    public static void main(String[] args) {
        // try-with-resources closes the session even when a step below throws;
        // the original only reached spark.close() on the success path.
        try (SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate()) {
            // Read both input files as RDDs of raw text lines.
            JavaRDD<String> strarr = spark.sparkContext().textFile("E:/study/spark/readme.txt", 1).toJavaRDD();
            JavaRDD<String> strarr2 = spark.sparkContext().textFile("E:/study/spark/readme2.txt", 1).toJavaRDD();
            // Convert the raw lines into bean RDDs.
            JavaRDD<Student> stus1 = getJavaRDDStu(strarr);
            JavaRDD<SampleStudent> stus2 = getJavaRDDSam(strarr2);
            // Build Dataset<Row> objects from the bean RDDs.
            Dataset<Row> datasets = spark.createDataFrame(stus1, Student.class);
            Dataset<Row> datasets2 = spark.createDataFrame(stus2, SampleStudent.class);
            // Left-join on equal ID and drop the right-hand ID column so only one ID remains.
            Dataset<Row> leftresult = datasets
                    .join(datasets2, datasets.col("ID").equalTo(datasets2.col("ID")), "left")
                    .drop(datasets2.col("ID"));
            leftresult.show();
            // Replace nulls in "intersert", "sex" and "grade" with default values.
            Dataset<Row> result = leftresult.na().fill("eat", new String[]{"intersert"});
            result = result.na().fill("man", new String[]{"sex"}).na().fill(60, new String[]{"grade"});
            result.show();
            // Add a column "exAge" whose values are copied from "age".
            result = result.withColumn("exAge", col("age"));
            result.show();
            result.createOrReplaceTempView("table");
            // Query the temp view through the session itself; no separate SQLContext is needed.
            Dataset<Row> temview2 = spark.sql("SELECT ID, avg(grade) FROM table group by ID");
            temview2.show();
        }
    }

    /**
     * Builds a {@code JavaRDD<Student>} from comma-separated lines of the form
     * {@code ID,name,age,classgrade}. Blank lines are skipped.
     *
     * @param strarr RDD of raw text lines
     * @return RDD with one Student per non-blank line
     */
    public static JavaRDD<Student> getJavaRDDStu(JavaRDD<String> strarr)
    {
        return strarr
                .filter(line -> !line.trim().isEmpty())
                .map(StudentTest::parseStudent);
    }

    /**
     * Builds a {@code JavaRDD<SampleStudent>} from space-separated lines of the form
     * {@code ID sex intersert grade}. Blank lines are skipped.
     *
     * @param strarr RDD of raw text lines
     * @return RDD with one SampleStudent per non-blank line
     */
    public static JavaRDD<SampleStudent> getJavaRDDSam(JavaRDD<String> strarr)
    {
        return strarr
                .filter(line -> !line.trim().isEmpty())
                .map(StudentTest::parseSampleStudent);
    }

    /** Parses one {@code ID,name,age,classgrade} line into a Student. */
    private static Student parseStudent(String line) {
        String[] attrs = line.split(",");
        if (attrs.length < STUDENT_FIELD_COUNT) {
            throw new IllegalArgumentException(
                    "Expected " + STUDENT_FIELD_COUNT + " comma-separated fields but got "
                            + attrs.length + " in line: " + line);
        }
        Student stu = new Student();
        stu.setID(attrs[0]);
        stu.setName(attrs[1]);
        // trim() so surrounding whitespace around the number does not fail the parse
        stu.setAge(Integer.parseInt(attrs[2].trim()));
        stu.setClassgrade(attrs[3]);
        return stu;
    }

    /** Parses one {@code ID sex intersert grade} line into a SampleStudent. */
    private static SampleStudent parseSampleStudent(String line) {
        String[] attrs = line.split(" ");
        if (attrs.length < SAMPLE_FIELD_COUNT) {
            throw new IllegalArgumentException(
                    "Expected " + SAMPLE_FIELD_COUNT + " space-separated fields but got "
                            + attrs.length + " in line: " + line);
        }
        SampleStudent stu = new SampleStudent();
        stu.setID(attrs[0]);
        stu.setSex(attrs[1]);
        stu.setIntersert(attrs[2]);
        stu.setGrade(Integer.parseInt(attrs[3].trim()));
        return stu;
    }
}
/**
 * JavaBean describing a student record (ID, name, age, class grade).
 *
 * <p>Implements {@link java.io.Serializable} because instances are carried inside a
 * Spark RDD, which may need to serialize its elements. The getters and setters
 * follow the JavaBean convention.
 */
public class Student implements java.io.Serializable {
    private static final long serialVersionUID = 1L;

    // Fields are left public to stay source-compatible with any code that
    // accesses them directly; prefer the accessors below.
    public String ID;
    public String name;
    public int age;
    public String classgrade;

    /**
     * @return the fields joined with ':' in the order ID, name, age, classgrade
     */
    @Override
    public String toString() {
        return ID + ":" + name + ":" + age + ":" + classgrade;
    }

    /** @return the student ID */
    public String getID() {
        return ID;
    }

    /** @return the student name */
    public String getName() {
        return name;
    }

    /** @return the student age */
    public int getAge() {
        return age;
    }

    /** @return the student's class grade */
    public String getClassgrade() {
        return classgrade;
    }

    /** @param ID the student ID */
    public void setID(String ID) {
        this.ID = ID;
    }

    /** @param name the student name */
    public void setName(String name) {
        this.name = name;
    }

    /** @param age the student age */
    public void setAge(int age) {
        this.age = age;
    }

    /** @param classgrade the student's class grade */
    public void setClassgrade(String classgrade) {
        this.classgrade = classgrade;
    }
}
/**
 * JavaBean holding supplementary student data (ID, sex, interest, grade).
 *
 * <p>Implements {@link java.io.Serializable} because instances are carried inside a
 * Spark RDD, which may need to serialize its elements. The property name
 * {@code intersert} is kept as spelled, since column names are derived from it.
 */
public class SampleStudent implements java.io.Serializable
{
    private static final long serialVersionUID = 1L;

    private String ID;
    private String sex;
    private String intersert;
    private int grade;

    /** @return the grade */
    public int getGrade() {
        return grade;
    }

    /** @param grade the grade */
    public void setGrade(int grade) {
        this.grade = grade;
    }

    /** @param name the value stored as this student's sex */
    public void setSex(String name) {
        this.sex = name;
    }

    /** @param ID the student ID */
    public void setID(String ID) {
        this.ID = ID;
    }

    /** @param intersert the student's interest */
    public void setIntersert(String intersert) {
        this.intersert = intersert;
    }

    /** @return the student ID */
    public String getID() {
        return ID;
    }

    /** @return the student's sex */
    public String getSex() {
        return sex;
    }

    /** @return the student's interest */
    public String getIntersert() {
        return intersert;
    }
}

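readme和readme2的内容(按上面代码中的解析方式:readme.txt每行以英文逗号分隔,依次为ID、name、age、classgrade;readme2.txt每行以空格分隔,依次为ID、sex、intersert、grade)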

 

实例二,通过Spark读取本地文件,统计其中各个单词出现的次数。实例如下:


import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
import java.util.Arrays;

/**
 * Word-count demo: reads a local text file, splits every line on single spaces
 * and prints how many times each token occurs, together with the elapsed time
 * in milliseconds.
 */
public class WordTest {
    public static void main(String[] args) {
        // try-with-resources so the session is closed on every exit path;
        // the original never closed it.
        try (SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate()) {
            long startTime = System.currentTimeMillis();
            // Read the text file from the local directory as an RDD of lines.
            JavaRDD<String> wordSets = spark.sparkContext()
                    .textFile("E:/study/spark/Definitions.txt", 1).toJavaRDD();
            // flatMap: each line becomes its space-separated tokens, flattened into one RDD.
            JavaRDD<String> words = wordSets.flatMap(row -> Arrays.asList(row.split(" ")).iterator());
            // mapToPair: pair every token with an initial count of 1.
            JavaPairRDD<String,Integer> wordPairs = words.mapToPair(row -> new Tuple2<>(row, 1));
            // reduceByKey: sum the counts of identical tokens.
            JavaPairRDD<String,Integer> wordByKey = wordPairs.reduceByKey(Integer::sum);
            // collect() is the action that actually runs the transformations above
            // and brings the result back to the driver. It must run before the
            // elapsed time is taken; otherwise only the (lazy) plan construction
            // is timed.
            java.util.List<Tuple2<String,Integer>> counts = wordByKey.collect();
            System.out.println(System.currentTimeMillis()-startTime);
            System.out.println(counts);
        }
    }
}

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值