Operating Spark with Python and Java

I. Operating Spark with Python

The test data is as follows:

"id","name","money"
"1","aaa","900"
"2","bbb","1000"
"3","ccc","1000"
"5","ddd","1000"
"6","ddd","1000"

Install pyspark to interact with Spark, and findspark to locate the local Spark installation and its configuration (e.g. pip install pyspark findspark).

1. RDD

import findspark

findspark.init()  # locate the local Spark installation before importing pyspark
from pyspark.sql import SparkSession

# Create a local SparkSession
sparkSession = SparkSession.builder.appName("spark").master("local").getOrCreate()
# Read the CSV (first line as the header) and convert the DataFrame into an RDD of Rows
rdd = sparkSession.read.option("header", True).csv("../files/account.csv").rdd
for row in rdd.collect():
    print(row)

Output: (screenshot omitted)
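The rdd above is an ordinary RDD of Row objects, so the usual transformations can be chained onto it before collecting. As a minimal sketch (assuming the same account.csv and column names as above, with an arbitrary threshold of 1000 for illustration), the rows can be turned into (name, money) pairs and filtered:

import findspark

findspark.init()
from pyspark.sql import SparkSession

sparkSession = SparkSession.builder.appName("spark").master("local").getOrCreate()
rdd = sparkSession.read.option("header", True).csv("../files/account.csv").rdd

# Keep only the name and money columns and drop accounts below 1000
pairs = rdd.map(lambda row: (row["name"], int(row["money"]))) \
    .filter(lambda pair: pair[1] >= 1000)
for name, money in pairs.collect():
    print(name, money)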

2. SparkSQL

import findspark

findspark.init()
from pyspark.sql import SparkSession

sparkSession = SparkSession.builder.appName("spark").master("local").getOrCreate()
df = sparkSession.read.option("header", True).csv("../files/account.csv")
# Register the DataFrame as a temporary view so it can be queried with SQL
df.createTempView("account")
sparkSession.sql("select * from account").show()

Output: (screenshot omitted)
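Once account is registered as a temporary view, any Spark SQL statement can be run against it, not just select *. A minimal sketch (same CSV and column names as above) that totals money per name; note that the CSV reader loads every column as a string, so money is cast before aggregating:

import findspark

findspark.init()
from pyspark.sql import SparkSession

sparkSession = SparkSession.builder.appName("spark").master("local").getOrCreate()
df = sparkSession.read.option("header", True).csv("../files/account.csv")
df.createOrReplaceTempView("account")

# money is parsed as a string, so cast it to int before summing
sparkSession.sql(
    "select name, sum(cast(money as int)) as total from account group by name"
).show()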
3. Reading a file from HDFS

import findspark

findspark.init()
from pyspark.sql import SparkSession

sparkSession = SparkSession.builder.appName("spark").master("local").getOrCreate()
# Read directly from HDFS; master:9000 is the NameNode address used in this cluster
df = sparkSession.read.option("header", True).csv("hdfs://master:9000/testdata/account/account.csv")
df.createTempView("account")
sparkSession.sql("select * from account").show()

Output: (screenshot omitted)
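Writing back to HDFS works through the same API. A minimal sketch, assuming the same NameNode address; the output directory /testdata/account_out is a made-up path for this example, and mode("overwrite") replaces it if it already exists:

import findspark

findspark.init()
from pyspark.sql import SparkSession

sparkSession = SparkSession.builder.appName("spark").master("local").getOrCreate()
df = sparkSession.read.option("header", True).csv("hdfs://master:9000/testdata/account/account.csv")

# Write the DataFrame back to HDFS as CSV; the output path is only an example
df.write.mode("overwrite").option("header", True).csv("hdfs://master:9000/testdata/account_out")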

II. Operating Spark with Java

The test data is as follows:

"id","name","money"
"1","aaa","900"
"2","bbb","1000"
"3","ccc","1000"
"5","ddd","1000"
"6","ddd","1000"

Add the Maven dependency:

<dependency>
	<groupId>org.apache.spark</groupId>
	<artifactId>spark-sql_2.12</artifactId>
	<version>2.4.7</version>
</dependency>

1. RDD

package com.it.spark_sql;

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class SparkDemo01 {
    public static void main(String[] args) {
        // Build a local SparkSession on top of a SparkContext
        SparkConf conf = new SparkConf().setMaster("local").setAppName("spark");
        SparkContext sc = new SparkContext(conf);
        SparkSession sparkSession = new SparkSession(sc);
        // Read the CSV (first line as the header) and convert the Dataset to a JavaRDD of Rows
        JavaRDD<Row> rdd = sparkSession.read().option("header", true).csv("files/account.csv").toJavaRDD();
        for (Row row : rdd.collect()) {
            System.out.println(row);
        }
        sparkSession.close();
    }
}

Output: (screenshot omitted)
2. SparkSQL

package com.it.spark_sql;

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class SparkDemo02 {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("spark");
        SparkContext sc = new SparkContext(conf);
        SparkSession sparkSession = new SparkSession(sc);
        Dataset<Row> ds = sparkSession.read().option("header", true).csv("files/account.csv");
        // Register the Dataset as a temporary view so it can be queried with SQL
        ds.createOrReplaceTempView("account");
        sparkSession.sql("select * from account").show();
        sparkSession.close();
    }
}

Output: (screenshot omitted)
3. Reading a file from HDFS

package com.it.spark_sql;

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class SparkDemo03 {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("spark");
        SparkContext sc = new SparkContext(conf);
        SparkSession sparkSession = new SparkSession(sc);
        // Read directly from HDFS; master:9000 is the NameNode address used in this cluster
        Dataset<Row> ds = sparkSession.read().option("header", true).csv("hdfs://master:9000/testdata/account/account.csv");
        ds.createOrReplaceTempView("account");
        sparkSession.sql("select * from account").show();
        sparkSession.close();
    }
}

Output: (screenshot omitted)

III. Case study

Task: for each user, keyed by name and gender, sum the values in the third column, and output the female users whose total exceeds 120.

Data:

LiuYang,female,20
YuanJing,male,10
GuoYijun,male,5
CaiXuyu,female,50
Liyuan,male,20
FangBo,female,50
LiuYang,female,20
YuanJing,male,10
GuoYijun,male,50
CaiXuyu,female,50
FangBo,female,60
LiuYang,female,20
YuanJing,male,10
CaiXuyu,female,50
FangBo,female,50
GuoYijun,male,5
CaiXuyu,female,50
Liyuan,male,20
CaiXuyu,female,50
FangBo,female,50
LiuYang,female,20
YuanJing,male,10
FangBo,female,50
GuoYijun,male,50
CaiXuyu,female,50
FangBo,female,60

1. Python code

import findspark

findspark.init()
from pyspark import SparkContext

# Local SparkContext, matching the master("local") used in the earlier examples
sc = SparkContext(master="local", appName="spark")
# For each line "name,gender,time": key = "name-gender", value = time as an int;
# sum the times per key, then keep female users whose total exceeds 120
result = sc.textFile("../files/inputdata.txt").map(
    lambda line: ('{}-{}'.format(line.split(",")[0], line.split(",")[1]), int(line.split(",")[2]))
).reduceByKey(lambda a, b: a + b) \
    .filter(lambda i: i[1] > 120) \
    .filter(lambda i: 'female' in i[0]) \
    .collect()
for i in result:
    print(i)

Output: (screenshot omitted)
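The same aggregation can also be written with the DataFrame/SparkSQL API instead of pair RDDs. A minimal sketch, assuming the same inputdata.txt and treating the three columns as name, gender and time; the view name visits is made up for this example:

import findspark

findspark.init()
from pyspark.sql import SparkSession

sparkSession = SparkSession.builder.appName("spark").master("local").getOrCreate()
# The file has no header line, so name the columns explicitly
df = sparkSession.read.csv("../files/inputdata.txt").toDF("name", "gender", "time")
df.createOrReplaceTempView("visits")

# Total time per (name, gender); keep only female users with a total over 120
sparkSession.sql(
    "select name, gender, sum(cast(time as int)) as total "
    "from visits "
    "where gender = 'female' "
    "group by name, gender "
    "having sum(cast(time as int)) > 120"
).show()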

2. Java code:

package com.it.spark_sql;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class SparkDemo05 {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("spark");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        // Key each line by "name-gender", sum the times per key,
        // then keep female users whose total exceeds 120
        JavaPairRDD<String, Integer> female = jsc.textFile("files/inputdata.txt")
                .mapToPair(line -> {
                    String[] fields = line.split(",");
                    return new Tuple2<>(fields[0] + "-" + fields[1], Integer.parseInt(fields[2]));
                })
                .reduceByKey(Integer::sum)
                .filter(tuple2 -> tuple2._1.contains("female"))
                .filter(tuple2 -> tuple2._2 > 120);
        female.collect().forEach(System.out::println);
        jsc.close();
    }
}

Output: (screenshot omitted)
