Java version (java版本):
import java.util.ArrayList;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
/**
* Created by rong on 2016/3/19.
*/
public class RDD2DataFrameByProgrammatically {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName(“RDD2DataFrame”).setMaster(“local”);
JavaSparkContext sc= new JavaSparkContext(conf);
JavaRDD<String> lines = sc.textFile("C://Users//rong//Desktop//persons.txt");
SQLContext sqlContext = new SQLContext(sc);
JavaRDD<Row> javaRdd = lines.map(new Function<String, Row>() {
public Row call(String line) throws Exception {
String[] str = line.split(",");
return RowFactory.create(Integer.valueOf(str[0]),str[1],Integer.valueOf(str[2]));//Row工厂类创建row
}
});
List<StructField> structFields = new ArrayList<StructField>();
structFields.add(DataTypes.createStructField("id",DataTypes.IntegerType,true));
structFields.add(DataTypes.createStructField("name",DataTypes.StringType,true));
structFields.add(DataTypes.createStructField("age",DataTypes.IntegerType,true));
StructType structType = DataTypes.createStructType(structFields);
DataFrame df = sqlContext.createDataFrame(javaRdd,structType);
df.registerTempTable("persons");
df.show();
DataFrame dfs = sqlContext.sql("select * from persons where age > 6");
JavaRDD<Row> rows = dfs.javaRDD();
List<Row> list = rows.collect();
for(Row row : list){
System.out.println(row);
}
}
}
Scala version (scala版本):
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.{SparkContext, SparkConf}
import scala.io.Source
/**
* Created by rong on 2016/3/20.
*/
object RDD2DataFrameByProgrammatically2 {
def main(args: Array[String]) {
val conf = new SparkConf().setMaster(“local”).setAppName(“RDD2DataFrameByProgrammatically2”)
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc) //实际生产用HiveContext
val persons = sc.textFile("C://Users//rong//Desktop//text//persons.txt")
//通过读取文件实现schame
val lines = Source.fromFile("C://Users//rong//Desktop//text//fields.txt")
var schemaString = ""
for (elem <- lines.getLines()) {
schemaString += elem + " "
}
val str = schemaString.split("\\s+")
val structType = StructType(str.map(fieldName => StructField(fieldName, StringType, true)))
var rowPerson = Row()
val rowRdd = persons.map(br => {
val ps = br.split(",")
ps.foreach(bk => rowPerson = Row.merge(rowPerson,Row(bk)))
rowPerson
})
val df = sqlContext.createDataFrame(rowRdd, structType)
df.registerTempTable("persons")
df.show()
df.select("name").show()//和df.select(df.col("name")).show()一样
val results = sqlContext.sql("select * from persons where age > 6")
results.map(x => x(0) + " " + x(1)).collect().foreach(println)
}
}