Converting Between RDD and DataFrame (Java + Scala)

I. Converting an RDD to a DataFrame

  1. Infer the metadata of the RDD's elements through reflection. A record in an RDD carries no metadata of its own: a Person has fields such as name and id, but the raw record knows nothing about them, while the resulting DataFrame must know them. Reflection recovers this metadata and enables the conversion to a DataFrame. How is the reflection expressed?
    Scala: through a case class mapping; the case class declares the metadata of each column of every record in the RDD.
    Java: describe the metadata by writing a JavaBean, build the metadata from the bean, then perform the conversion. Note that this approach cannot produce DataFrames with nested types.
  2. Obtain the schema dynamically. When the RDD's metadata is not known in advance, the only option is to construct a concrete schema at runtime and apply it to the existing RDD. This situation is actually quite common; a minimal sketch follows this list.
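
To make approach 2 concrete before the full examples below, here is a minimal, self-contained sketch. It assumes Spark 2.x; the class name, the file persons.txt, and the id/name/age fields are illustrative placeholders, not taken from the original post.

    import java.util.Arrays;
    import java.util.List;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.RowFactory;
    import org.apache.spark.sql.SparkSession;
    import org.apache.spark.sql.types.DataTypes;
    import org.apache.spark.sql.types.StructField;
    import org.apache.spark.sql.types.StructType;

    public class DynamicSchemaSketch {
        public static void main(String[] args) {
            SparkSession spark = SparkSession.builder()
                    .master("local").appName("DynamicSchemaSketch").getOrCreate();

            // Build the schema at runtime; the field definitions could equally
            // come from strings or from an external database.
            List<StructField> fields = Arrays.asList(
                    DataTypes.createStructField("id", DataTypes.IntegerType, true),
                    DataTypes.createStructField("name", DataTypes.StringType, true),
                    DataTypes.createStructField("age", DataTypes.IntegerType, true));
            StructType schema = DataTypes.createStructType(fields);

            // Turn each "id,name,age" text line into a generic Row matching the schema.
            JavaRDD<Row> rows = spark.read().textFile("persons.txt").javaRDD()
                    .map(line -> {
                        String[] parts = line.split(",");
                        return RowFactory.create(Integer.valueOf(parts[0].trim()),
                                parts[1].trim(), Integer.valueOf(parts[2].trim()));
                    });

            // Apply the runtime-built schema to the RDD of Rows.
            Dataset<Row> df = spark.createDataFrame(rows, schema);
            df.show();
        }
    }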

II. Code in practice

    Java implementation

    package com.dt.spark.SparkApps.sql;
    import java.util.List;
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.Function;
    import org.apache.spark.sql.DataFrame;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SQLContext;
    public class RDDToDataFrameByReflection {
    
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("RDDToDataFrameByReflection");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);
    
        // Read the input data
        JavaRDD<String> lines = sc.textFile("E://persons.txt");
    
        JavaRDD<Person> persons = lines.map(new Function<String,Person>(){
    
            private static final long serialVersionUID = 1L;
    
            @Override
            public Person call(String line) throws Exception {
                String[] splited = line.split(",");
                Person p = new Person();
                p.setId(Integer.valueOf(splited[0].trim()));
                p.setName(splited[1].trim());
                p.setAge(Integer.valueOf(splited[2].trim()));
                return p;
            }
    
        });
    
        // The first argument is the RDD; the second is the JavaBean class (Person).
        // The JavaBean encapsulates Person's metadata, so through this second
        // argument the DataFrame obtains the metadata as well. Under the hood,
        // reflection extracts all of Person's fields and, combined with the RDD
        // itself, produces the DataFrame.
        DataFrame df = sqlContext.createDataFrame(persons, Person.class);
    
        df.registerTempTable("persons");
        DataFrame bigDatas = sqlContext.sql("select * from persons where age >= 6");
    
        //DataFrame => RDD
        JavaRDD<Row> bigDataRDD = bigDatas.javaRDD();
    
        JavaRDD<Person> result = bigDataRDD.map(new Function<Row,Person>(){
    
            private static final long serialVersionUID = 1L;
    
            @Override
            public Person call(Row row) throws Exception {
                // Rebuild a concrete Person from each returned Row
                Person p = new Person();
    
                /**
                 * The DataFrame optimizes the data internally and sorts the
                 * columns of the schema, so the column order is no longer
                 * id, name, age (here it is age, id, name).
                 */
                p.setId(row.getInt(1));
                p.setName(row.getString(2));
                p.setAge(row.getInt(0));
    
                return p;
            }
    
        });
    
        List<Person> personList = result.collect();
        for(Person p : personList){
            System.out.println(p);
        }
    
    }

}
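
One caveat worth adding (not in the original post): reading columns by position, as above, silently breaks if the column order ever changes. Spark's Row also supports lookup by field name via Row.getAs, so a more robust version of the Row-to-Person mapping is the following sketch:

    // Drop-in replacement for the mapping above that looks columns up by
    // name rather than by position, so it no longer depends on column order.
    JavaRDD<Person> resultByName = bigDataRDD.map(new Function<Row, Person>() {
        private static final long serialVersionUID = 1L;

        @Override
        public Person call(Row row) throws Exception {
            Person p = new Person();
            p.setId(row.<Integer>getAs("id"));
            p.setName(row.<String>getAs("name"));
            p.setAge(row.<Integer>getAs("age"));
            return p;
        }
    });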
Alternatively, the schema can be constructed dynamically at runtime, as in the following job taken from a SparkSession-based, Hive-enabled application. It filters trips by distance from a fixed center point, aggregates per-vehicle counts, and applies a schema built on the fly. Note that fence_real_sql, table_name, dis, day, centerLon, centerLat, distance, GeoCoord, UberH3Util, Distance, and sortMapToStringAppend are variables and helpers defined elsewhere in that application and are not shown here.

SparkSession sparkSession = SparkSession.builder()
        .appName("ads_chpp_fence_trips_count")
        .enableHiveSupport()
        .getOrCreate();

JavaRDD<Row> reslutRdd = sparkSession.sql(String.format(fence_real_sql, table_name, dis, day))
        .javaRDD()
        .filter(new Function<Row, Boolean>() {
            // Keep trips whose end point or start point lies within 300 km of Meishan
            @Override
            public Boolean call(Row row) throws Exception {
                String e_fence_id = row.getString(7);
                String s_fence_id = row.getString(1);
                GeoCoord geoCoord = UberH3Util.h3ToGeoCenter(e_fence_id);
                GeoCoord geoCoord1 = UberH3Util.h3ToGeoCenter(s_fence_id);
                return Distance.getDistance(centerLon, centerLat, geoCoord.lng, geoCoord.lat) < distance
                        || Distance.getDistance(centerLon, centerLat, geoCoord1.lng, geoCoord1.lat) < distance;
            }
        })
        .mapToPair(new PairFunction<Row, String, List<String[]>>() {
            @Override
            public Tuple2<String, List<String[]>> call(Row row) throws Exception {
                String vidAsKey = row.getString(0);
                // "未标注" ("unlabeled") replaces null or empty fence names
                String[] filedAsValues = {
                        row.getString(1),
                        row.getString(2) == null || "null".equals(row.getString(2)) || "".equals(row.getString(2))
                                ? "未标注" : row.getString(2),
                        row.getString(3), row.getString(4), row.getString(5), row.getString(6),
                        row.getString(7),
                        row.getString(8) == null || "null".equals(row.getString(8)) || "".equals(row.getString(8))
                                ? "未标注" : row.getString(8),
                        row.getString(9), row.getString(10), row.getString(11)
                };
                List<String[]> ls = new ArrayList<>();
                ls.add(filedAsValues);
                return new Tuple2<>(vidAsKey, ls);
            }
        })
        .reduceByKey(new Function2<List<String[]>, List<String[]>, List<String[]>>() {
            // Merge the lists that share the same key into one: [["","",...], [], [], ...]
            @Override
            public List<String[]> call(List<String[]> strings, List<String[]> strings2) throws Exception {
                List<String[]> tempList = new ArrayList<>();
                tempList.addAll(strings);
                tempList.addAll(strings2);
                return tempList;
            }
        })
        .map(new Function<Tuple2<String, List<String[]>>, Row>() {
            @Override
            public Row call(Tuple2<String, List<String[]>> kv) throws Exception {
                String vid = kv._1();
                List<String[]> vList = kv._2();
                Map<String, Integer> allStartProvinceCountMap = new HashMap<>(); // trips per start province
                Map<String, Integer> allStartCityCountMap = new HashMap<>();     // trips per start city
                Map<String, Integer> allStartFenceCountMap = new HashMap<>();    // trips per start fence
                Map<String, Integer> allEndProvinceCountMap = new HashMap<>();   // trips per end province
                Map<String, Integer> allEndCityCountMap = new HashMap<>();       // trips per end city
                Map<String, Integer> allEndFenceCountMap = new HashMap<>();      // trips per end fence
                for (String[] rowValues : vList) {
                    // rowValues layout: [s_fence_id, s_fence_name, s_time, s_province_name, s_city_name,
                    //                    s_country_name, e_fence_id, e_fence_name, e_province_name,
                    //                    e_city_name, e_country_name]
                    String s_province_name = rowValues[3];
                    String s_city_name = rowValues[4];
                    String s_fence_name = rowValues[1] + "-" + rowValues[0] + "-" + rowValues[3] + "-"
                            + rowValues[4] + "-" + rowValues[5];
                    String e_province_name = rowValues[8];
                    String e_city_name = rowValues[9];
                    String e_fence_name = rowValues[7] + "-" + rowValues[6] + "-" + rowValues[8] + "-"
                            + rowValues[9] + "-" + rowValues[10];
                    // Count each start province / city / fence and each end
                    // province / city / fence: insert 1 on first sight,
                    // otherwise increment the existing count.
                    allStartProvinceCountMap.merge(s_province_name, 1, Integer::sum);
                    allStartCityCountMap.merge(s_city_name, 1, Integer::sum);
                    allStartFenceCountMap.merge(s_fence_name, 1, Integer::sum);
                    allEndProvinceCountMap.merge(e_province_name, 1, Integer::sum);
                    allEndCityCountMap.merge(e_city_name, 1, Integer::sum);
                    allEndFenceCountMap.merge(e_fence_name, 1, Integer::sum);
                }
                String s_proResult = sortMapToStringAppend(allStartProvinceCountMap);
                String s_cityResult = sortMapToStringAppend(allStartCityCountMap);
                String s_fenceResult = sortMapToStringAppend(allStartFenceCountMap);
                String e_proResult = sortMapToStringAppend(allEndProvinceCountMap);
                String e_cityResult = sortMapToStringAppend(allEndCityCountMap);
                String e_fenceResult = sortMapToStringAppend(allEndFenceCountMap);
                return RowFactory.create(vid, s_proResult, s_cityResult, s_fenceResult,
                        e_proResult, e_cityResult, e_fenceResult);
            }
        });

/**
 * Dynamically build the DataFrame's metadata. In general the fields here can
 * come from strings or from an external database.
 */
List<StructField> asList = Arrays.asList(
        // The field order here must match the Rows created above
        DataTypes.createStructField("vid", DataTypes.StringType, true),
        DataTypes.createStructField("s_pro_count", DataTypes.StringType, true),
        DataTypes.createStructField("s_city_count", DataTypes.StringType, true),
        DataTypes.createStructField("s_fence_name_count", DataTypes.StringType, true),
        DataTypes.createStructField("e_pro_count", DataTypes.StringType, true),
        DataTypes.createStructField("e_city_count", DataTypes.StringType, true),
        DataTypes.createStructField("e_fence_name_count", DataTypes.StringType, true));
StructType schema = DataTypes.createStructType(asList);
sparkSession.createDataFrame(reslutRdd, schema).registerTempTable("car_fence_300_temp");
sparkSession.sql("create table ads_chpp_dev.car_count_test_10 like car_fence_300_temp");
sparkSession.sql("insert into ads_chpp_dev.car_count_test_10 select * from car_fence_300_temp");
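
A small API note: on Spark 2.x, registerTempTable is deprecated in favor of createOrReplaceTempView, so the last three lines would more idiomatically read:

    // Spark 2.x replacement for the deprecated registerTempTable call.
    sparkSession.createDataFrame(reslutRdd, schema).createOrReplaceTempView("car_fence_300_temp");
    sparkSession.sql("create table ads_chpp_dev.car_count_test_10 like car_fence_300_temp");
    sparkSession.sql("insert into ads_chpp_dev.car_count_test_10 select * from car_fence_300_temp");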
package com.dt.spark.SparkApps.sql;
import java.io.Serializable;

//Because reflection is used under the hood, the JavaBean must be public.
//It must also be serializable, since the job runs in distributed fashion.
public class Person implements Serializable{

    private static final long serialVersionUID = 1L;
    private int id;
    private String name;
    private int age;

    public int getId() {
        return id;
    }
    public void setId(int id) {
        this.id = id;
    }
    public String getName() {
        return name;
    }
    public void setName(String name) {
        this.name = name;
    }
    public int getAge() {
        return age;
    }
    public void setAge(int age) {
        this.age = age;
    }

    @Override
    public String toString() {
        return "Person [id=" + id + ", name=" + name + ", age=" + age + "]";
    }
}

Scala implementation

package com.dataguru.xzl.two.com.dt

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Created by xzl on 2016/3/16.
 */
object RDD2DataFrameByReflection {

  // The case class must be declared outside the main method
  case class Person(id: Int, name: String, age: Int)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("RDD2DataFrameByReflection")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // This import enables the implicit conversion of an RDD into a DataFrame
    import sqlContext.implicits._

    val lines = sc.textFile("d://persons.txt")

    val df = lines.map(_.split(",")).map { splited =>
      Person(splited(0).trim().toInt, splited(1), splited(2).trim().toInt)
    }.toDF() // toDF("colName", ...) can also supply explicit column names

    df.registerTempTable("persons")

    val bigDatas = sqlContext.sql("select * from persons where age >= 6")

    // DataFrame => RDD; .rdd (rather than .javaRDD) yields an Array[Row]
    // on collect, which a Scala for-comprehension can iterate directly.
    val personList = bigDatas.rdd.collect()

    for (p <- personList) {
      println(p)
    }
  }
}