Constructing a table from a POJO:
package sparkSql;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

import spark.Person;

public class DataFrameDemo {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("test");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);

        // Parse each "name, age" line of people.txt into a Person bean.
        JavaRDD<Person> people = sc.textFile("D:\\WorkSpace\\Distributed_framework\\spark-1.5.1\\spark-1.5.1\\examples/src/main/resources/people.txt").map(
                new Function<String, Person>() {
                    public Person call(String line) throws Exception {
                        String[] parts = line.split(",");
                        Person person = new Person();
                        person.setName(parts[0]);
                        person.setAge(Integer.parseInt(parts[1].trim()));
                        return person;
                    }
                });

        // Spark infers the table schema from the Person bean's getters.
        DataFrame schemaPeople = sqlContext.createDataFrame(people, Person.class);
        // DataFrame schemaPeople = sqlContext.read().json("D:\\WorkSpace\\Distributed_framework\\spark-1.5.1\\spark-1.5.1\\examples/src/main/resources/people.json");
        schemaPeople.show();
        schemaPeople.printSchema();
        schemaPeople.select("name").show();
        schemaPeople.select(schemaPeople.col("name"), schemaPeople.col("age").plus(1)).show();

        // Register the DataFrame as a temporary table so it can be queried with SQL.
        schemaPeople.registerTempTable("people");
        DataFrame teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");
        teenagers.show();
    }

    public static void readJson(SQLContext sqlContext) {
        DataFrame df = sqlContext.read().json("D:\\WorkSpace\\Distributed_framework\\spark-1.5.1\\spark-1.5.1\\examples/src/main/resources/people.json");
        df.show();
        df.printSchema();
        df.select("name").show();
    }
}
// The Person bean lives in the spark package (matching the import above) and
// must be serializable so Spark can ship instances to executors.
package spark;

import java.io.Serializable;

public class Person implements Serializable {
    private String name;
    private int age;

    public String getName() {
        return name;
    }
    public void setName(String name) {
        this.name = name;
    }
    public int getAge() {
        return age;
    }
    public void setAge(int age) {
        this.age = age;
    }
}
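As a side note, the teenage filter above can also be written with the DataFrame column API instead of SQL. A minimal sketch, assuming the schemaPeople DataFrame built above (teens is just an illustrative name):

// Same predicate as the SQL query, expressed through the column API.
DataFrame teens = schemaPeople.filter(
        schemaPeople.col("age").geq(13).and(schemaPeople.col("age").leq(19)));
teens.select("name").show();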
Constructing a table with an explicit schema, without creating a POJO class in advance:
package sparkSql;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.util.ArrayList;
import java.util.List;

/**
 * Created by 奔荣 on 2017/2/25.
 */
public class SchemaSparkSql {
    public static void main(String[] args) {
        // sc is an existing JavaSparkContext.
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("test");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);

        /**
         * Execute SQL against a table whose schema is declared programmatically.
         */
        String file = "D:\\WorkSpace\\Distributed_framework\\spark-1.5.1\\spark-1.5.1\\examples/src/main/resources/people.txt";
        JavaRDD<String> people = sc.textFile(file);

        // Build the schema explicitly as a list of StructFields. Both columns
        // are declared as strings, matching the raw text file.
        List<StructField> fields = new ArrayList<StructField>();
        fields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
        fields.add(DataTypes.createStructField("age", DataTypes.StringType, true));
        StructType schema = DataTypes.createStructType(fields);

        // Convert records of the RDD (people) to Rows.
        JavaRDD<Row> rowRDD = people.map(
                new Function<String, Row>() {
                    public Row call(String record) throws Exception {
                        String[] parts = record.split(",");
                        return RowFactory.create(parts[0], parts[1].trim());
                    }
                });

        // Apply the schema to the RDD.
        DataFrame peopleDataFrame = sqlContext.createDataFrame(rowRDD, schema);
        peopleDataFrame.registerTempTable("people");
        DataFrame results = sqlContext.sql("SELECT * FROM people WHERE age >= 19");
        results.show();

        // The results of SQL queries are DataFrames and support all the normal RDD operations.
        // The columns of a row in the result can be accessed by ordinal.
        List<String> names = results.javaRDD().map(new Function<Row, String>() {
            public String call(Row row) {
                return "Name: " + row.getString(0);
            }
        }).collect();
        System.out.println(names);
    }
}
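Because the schema above declares age as a string, the age >= 19 comparison leans on Spark's implicit casting. To get a properly typed column instead, declare it as IntegerType and parse the value while building the rows. A minimal sketch, assuming sc, sqlContext, and file from the program above (typedFields, typedRows, and typedPeople are illustrative names):

// Variant of the schema above with a typed integer age column.
List<StructField> typedFields = new ArrayList<StructField>();
typedFields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
typedFields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));
StructType typedSchema = DataTypes.createStructType(typedFields);

JavaRDD<Row> typedRows = sc.textFile(file).map(
        new Function<String, Row>() {
            public Row call(String record) throws Exception {
                String[] parts = record.split(",");
                // The Row value must match the declared IntegerType.
                return RowFactory.create(parts[0], Integer.parseInt(parts[1].trim()));
            }
        });

DataFrame typedPeople = sqlContext.createDataFrame(typedRows, typedSchema);
typedPeople.printSchema(); // age now prints as integer rather than string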
DataFrame load/save examples:
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("test");
    JavaSparkContext sc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(sc);
    String inPath = "D:\\WorkSpace\\Distributed_framework\\spark-1.5.1\\spark-1.5.1\\examples/src/main/resources";
    String outPath = "G:\\data\\spark\\sql\\out";
    String parquetFile = inPath + "/users.parquet";

    /**
     * Load and save examples
     */
    // load() reads Parquet by default when no format is specified.
    DataFrame df = sqlContext.read().load(parquetFile);
    df.show();
    df.write().save(outPath + "/namesAndFavColors.parquet");

    // Load and save with an explicitly specified format.
    DataFrame dfJson = sqlContext.read().format("json").load(inPath + "/people.json");
    dfJson.select("name", "age").write().format("parquet").save(outPath + "/namesAndAges.parquet");
    dfJson.select("name", "age").write().format("json").save(outPath + "/namesAndAges.json");
}
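One caveat when re-running these examples: save() fails by default if the target path already exists. DataFrameWriter accepts a save mode to change that behavior; a minimal sketch, assuming dfJson and outPath from above (SaveMode is imported from org.apache.spark.sql.SaveMode):

// Overwrite the existing output directory instead of throwing an error.
dfJson.select("name", "age")
        .write()
        .mode(SaveMode.Overwrite)
        .format("json")
        .save(outPath + "/namesAndAges.json");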