Spark SQL: Methods for Converting an RDD to a DataFrame

Converting an RDD to a DataFrame via Reflection

The first approach uses reflection: the schema is inferred from the RDD's element type (a case class in Scala, a JavaBean in Java). It is simple, but it is not the recommended approach, because in real-world work it has limitations.

In older Scala versions, a case class supported at most 22 fields; beyond 22 fields you had to write your own class implementing the Product trait. So although this approach is simple, it is not general-purpose: production tables tend to have a great many fields, rarely just twenty or so.

 

//Java
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;




public class rddtoDFreflectionJava {
    public static void main(String[] args) {
        SparkSession spark = SparkSession
                .builder()
                .appName("program")
                .master("local").config("spark.sql.warehouse.dir", "file:/Users/zhangjingyu/Desktop/Spark架构/spark-warehouse")
                .getOrCreate();
        String Path = "file:/Users/zhangjingyu/Desktop/spark-2.4.0/examples/src/main/resources/people.txt";


        JavaRDD<PersonJava> personRDD = spark.read().textFile(Path).javaRDD().map(line -> {
            String name = line.split(",")[0];
            Long age = Long.valueOf(line.split(",")[1].trim());
            PersonJava person = new PersonJava();
            person.setName(name);
            person.setAge(age);
            return person;
        });
/**
 *         Equivalent version using an anonymous inner class instead of a lambda:
 *
 *         JavaRDD<PersonJava> personRdd = spark.read().textFile(Path).javaRDD().map(new Function<String, PersonJava>() {
 *             @Override
 *             public PersonJava call(String line) throws Exception {
 *                 String name = line.split(",")[0];
 *                 Long age = Long.valueOf(line.split(",")[1].trim());
 *                 PersonJava person = new PersonJava();
 *                 person.setName(name);
 *                 person.setAge(age);
 *                 return person;
 *             }
 *         });
 */
        Dataset<Row> personDF = spark.createDataFrame(personRDD, PersonJava.class);
        personDF.createOrReplaceTempView("test");
        Dataset<Row> ResultDF = spark.sql("select * from test a where a.age < 30");
        ResultDF.show();




        JavaRDD<PersonJava> ResultRDD = ResultDF.javaRDD().map(line -> {
            PersonJava person = new PersonJava();
            person.setName(line.getAs("name"));
            person.setAge(line.getAs("age"));
            return person;
        });


        for (PersonJava personJava : ResultRDD.collect()) {
            System.out.println(personJava.getName()+":"+personJava.getAge());
        }


/**
 *         Equivalent version using anonymous inner classes instead of lambdas:
 *
 *         JavaRDD<PersonJava> resultRdd = ResultDF.javaRDD().map(new Function<Row, PersonJava>() {
 *             @Override
 *             public PersonJava call(Row row) throws Exception {
 *                 PersonJava person = new PersonJava();
 *                 String name = row.getAs("name");
 *                 long age = row.getAs("age");
 *                 person.setName(name);
 *                 person.setAge(age);
 *                 return person;
 *             }
 *         });
 *         resultRdd.foreach(new VoidFunction<PersonJava>() {
 *             @Override
 *             public void call(PersonJava personJava) throws Exception {
 *                 System.out.println(personJava.getName()+":"+personJava.getAge());
 *             }
 *         });
 */
    }
}
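
The Java example above relies on a PersonJava bean class that the original post does not show. A minimal sketch of what it presumably looks like: Spark's reflection-based createDataFrame(JavaRDD, Class) requires a public JavaBean with getters and setters, and the class should be Serializable so instances can be shipped to executors; the bean property names become the column names.

//Java
import java.io.Serializable;

// Hypothetical definition of the PersonJava bean used above (not shown in the original post)
public class PersonJava implements Serializable {
    private String name;
    private Long age;

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }

    public Long getAge() { return age; }
    public void setAge(Long age) { this.age = age; }
}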


//Scala
object rddtoDFreflectionScala {
  case class Person(name: String, age: Long)


  def main(args: Array[String]): Unit = {
    // CommSparkSessionScala is the author's helper object (not shown) that builds a local SparkSession
    val spark = CommSparkSessionScala.getSparkSession()
    val path = "file:/Users/zhangjingyu/Desktop/spark-2.4.0/examples/src/main/resources/people.txt"
    import spark.implicits._
    val personDF = spark.sparkContext.textFile(path).map(line => line.split(",")).map(fields => {
      Person(fields(0), fields(1).trim.toLong)
    }).toDF
    personDF.createOrReplaceTempView("test")
    val resultDF = spark.sql("select * from test a where a.age > 20")
    val resultrdd = resultDF.rdd.map(x =>{
      val name = x.getAs[String]("name")
      val age = x.getAs[Long]("age")
      Person(name,age)
    })


    for (elem <- resultrdd.collect()) {
      println(elem.name + " : " + elem.age)
    }
  }
}
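
As a complement (not in the original post), the Java API can also produce a typed Dataset<PersonJava> instead of an untyped Dataset<Row> by using a bean encoder. A minimal sketch, assuming the spark session, personRDD, and PersonJava bean from the Java reflection example above:

//Java
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;

// Inside the same main method as the reflection example:
// build a typed Dataset directly from the RDD with a bean encoder.
Dataset<PersonJava> personDS =
        spark.createDataset(personRDD.rdd(), Encoders.bean(PersonJava.class));
// The cast picks the Java FilterFunction overload over the Scala one.
personDS.filter((FilterFunction<PersonJava>) p -> p.getAge() < 30).show();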

Converting an RDD to a DataFrame Programmatically

The second approach creates the DataFrame programmatically, specifying the schema in code, and it is used very widely. With the first approach the schema is in effect defined inside the case class; here you construct it yourself as a StructType.

//Java
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;




import java.util.ArrayList;
import java.util.List;


public class rddtoDFprogrammJava {
    public static void main(String[] args) {


        SparkSession spark = SparkSession
                .builder()
                .appName("program")
                .master("local").config("spark.sql.warehouse.dir", "file:/Users/zhangjingyu/Desktop/Spark架构/spark-warehouse")
                .getOrCreate();
        String Path = "file:/Users/zhangjingyu/Desktop/spark-2.4.0/examples/src/main/resources/people.txt";


        // Build the schema (the column definitions) programmatically
        List<StructField> fields = new ArrayList<>();
        StructField structField_name = DataTypes.createStructField("name", DataTypes.StringType, true);
        StructField structField_age = DataTypes.createStructField("age", DataTypes.LongType, true);
        fields.add(structField_name);
        fields.add(structField_age);
        StructType schema = DataTypes.createStructType(fields);


        // Parse each text line into a Row that matches the schema
        JavaRDD<Row> PersonRdd = spark.read().textFile(Path).javaRDD().map(x -> {
            String[] lines = x.split(",");
            return RowFactory.create(lines[0], Long.valueOf(lines[1].trim()));
        });


        Dataset<Row> PersonDF = spark.createDataFrame(PersonRdd, schema);
        PersonDF.createOrReplaceTempView("program");
        Dataset<Row> ResultDF = spark.sql("select * from program");
        ResultDF.show();


        for (Row row : ResultDF.javaRDD().collect()) {
            System.out.println(row);
        }
    }
}
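
The payoff of the programmatic approach is that the StructType can be assembled at runtime, for example from a header line, which is exactly what the reflection approach cannot do. A minimal sketch that would slot into the main method above (the header string and the all-string typing are hypothetical, for illustration only):

//Java
// Build a schema dynamically from a comma-separated header string,
// treating every column as a nullable string. Uses the same
// DataTypes/StructField/StructType imports as the class above.
String header = "name,age"; // in practice this might be read from the file's first line
List<StructField> dynamicFields = new ArrayList<>();
for (String column : header.split(",")) {
    dynamicFields.add(DataTypes.createStructField(column, DataTypes.StringType, true));
}
StructType dynamicSchema = DataTypes.createStructType(dynamicFields);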


//Scala


import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}


object rddtoDFprogrammScala {
  def main(args: Array[String]): Unit = {
    val spark = CommSparkSessionScala.getSparkSession()
    val path = "file:/Users/zhangjingyu/Desktop/spark-2.4.0/examples/src/main/resources/people.txt"
    val schema = StructType(Array(
      StructField("name", StringType, true),
      StructField("age", LongType, true)
    ))
    val rdd = spark.sparkContext.textFile(path).map(line => line.split(",")).map(fields => {
      Row(fields(0), fields(1).trim.toLong)
    })
    val PersonDF = spark.createDataFrame(rdd, schema)
    PersonDF.createOrReplaceTempView("person")
    val resultDF = spark.sql("select * from person a where a.age < 30")
    for (elem <- resultDF.collect()) {
      println(elem.get(0) + ":" + elem.get(1))
    }
  }
}
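
For reference, these SQL queries can equally be expressed with the DataFrame API, which avoids registering a temporary view. A minimal Java sketch, assuming the PersonDF from the programmatic Java example above:

//Java
import static org.apache.spark.sql.functions.col; // goes at the top of the file

// Equivalent of: select * from person a where a.age < 30
Dataset<Row> filtered = PersonDF.filter(col("age").lt(30));
filtered.show();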

Original author: Zhang Jingyu
