Java version (java版本):
import java.util.ArrayList;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
/**
* Created by rong on 2016/3/19.
*/
public class RDD2DataFrameByProgrammatically {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName(“RDD2DataFrame”).setMaster(“local”);
JavaSparkContext sc= new JavaSparkContext(conf);
JavaRDD<String> lines = sc.textFile("C://Users//rong//Desktop//persons.txt");
SQLContext sqlContext = new SQLContext(sc);
JavaRDD<Row> javaRdd = lines.map(new Function<String, Row>() {
public Row call(String line) throws Exception {
String[] str = line.split(",");
return RowFactory.create(Integer.valueOf(str[0]),str[1],Integer.valueOf(str[2]));//Row工厂类创建row
}
});
List<StructField> structFields = new ArrayList<StructField>();
structFields.add(DataTypes.createStructField("id",DataTypes.IntegerType,true));
structFields.add(DataTypes.createStructField("name",DataTypes.StringType,true));
structFields.add(DataTypes.createStructField("age",DataTypes.IntegerType,true));
StructType structType = DataTypes.createStructType(structFields);
DataFrame df = sqlContext.createDataFrame(javaRdd,structType);
df.registerTempTable("persons");
df.show();
DataFrame dfs = sqlContext.sql("select * from persons where age > 6");
JavaRDD<Row> rows = dfs.javaRDD();
List<Row> list = rows.collect();
for(Row row : list){
System.out.println(row);
}
}
}
Scala version (scala版本):
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.{SparkContext, SparkConf}
import scala.io.Source
/**
* Created by rong on 2016/3/20.
*/
object RDD2DataFrameByProgrammatically2 {
def main(args: Array[String]) {
val conf = new SparkConf().setMaster(“local”).setAppName(“RDD2DataFrameByProgrammatically2”)
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc) //实际生产用HiveContext
val persons = sc.textFile("C://Users//rong//Desktop//text//persons.txt")
//通过读取文件实现schame
val lines = Source.fromFile("C://Users//rong//Desktop//text//fields.txt")
var schemaString = ""
for (elem <- lines.getLines()) {
schemaString += elem + " "
}
val str = schemaString.split("\\s+")
val structType = StructType(str.map(fieldName => StructField(fieldName, StringType, true)))
var rowPerson = Row()
val rowRdd = persons.map(br => {
val ps = br.split(",")
ps.foreach(bk => rowPerson = Row.merge(rowPerson,Row(bk)))
rowPerson
})
val df = sqlContext.createDataFrame(rowRdd, structType)
df.registerTempTable("persons")
df.show()
df.select("name").show()//和df.select(df.col("name")).show()一样
val results = sqlContext.sql("select * from persons where age > 6")
results.map(x => x(0) + " " + x(1)).collect().foreach(println)
}
}