1. Import the Maven dependencies
<properties>
    <maven.compiler.source>8</maven.compiler.source>
    <maven.compiler.target>8</maven.compiler.target>
    <java.version>1.8</java.version>
    <spark.version>2.1.0</spark.version>
    <!-- scala.version is the Scala binary version used as the suffix of the Spark artifact IDs -->
    <scala.version>2.11</scala.version>
</properties>
<dependencies>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_${scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_${scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_${scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.30</version>
    </dependency>
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-library</artifactId>
        <version>2.11.11</version>
    </dependency>
</dependencies>
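Section 2 reads a JSON source in which each line is one self-contained JSON record. The examples that follow assume input shaped roughly like this (a hypothetical sample, matching the stuname/stuage/stusex columns used below):

{"stuname":"Tom","stuage":21,"stusex":"M"}
{"stuname":"Lucy","stuage":19,"stusex":"F"}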
2. Converting a Dataset to an RDD
import java.util.List;

import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class SparkSql {
    public static void main(String[] args) {
        // 1. Create the SparkSession
        SparkSession session = SparkSession.builder().master("local[1]").appName("Spark Sql").getOrCreate();
        // 2. Load the data source as a DataFrame (Dataset<Row>)
        Dataset<Row> json = session.read().json("C:\\Users\\spectre\\Desktop\\stu");
        // 3. Query, either through the DataFrame API...
        //Dataset<Row> se = json.select("stuname", "stuage", "stusex").where("stuage>20");
        //se.show();
        // ...or by registering a temporary view and running SQL against it
        json.createOrReplaceTempView("stus");
        Dataset<Row> dataset = session.sql("select stuname,stuage,stusex from stus");
        // Convert the Dataset to a JavaRDD and collect it to the driver
        // (JSON numbers are inferred as LongType, hence getLong for stuage)
        List<String> collect = dataset.javaRDD().map(new Function<Row, String>() {
            @Override
            public String call(Row row) throws Exception {
                return row.getString(0) + row.getLong(1) + row.getString(2);
            }
        }).collect();
        for (String s : collect) {
            System.out.println(s);
        }
    }
}
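The pom above also pulls in mysql-connector-java, which Spark's JDBC writer can use. Below is a minimal sketch of persisting the query result to MySQL, assuming a local database named test and hypothetical credentials (dataset is the Dataset<Row> from the snippet above):

// requires: import java.util.Properties; import org.apache.spark.sql.SaveMode;
Properties props = new Properties();
props.setProperty("user", "root");                      // hypothetical credentials
props.setProperty("password", "123456");                // hypothetical credentials
props.setProperty("driver", "com.mysql.jdbc.Driver");   // driver class in the 5.1.x connector
// Append the rows of `dataset` to table `stus` in database `test`
dataset.write().mode(SaveMode.Append).jdbc("jdbc:mysql://localhost:3306/test", "stus", props);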
3. Converting an RDD to a Dataset
(using reflection)
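Both snippets in this section read stuInfo.txt, a comma-separated file with lines shaped like 1,Tom,21 (a hypothetical sample). Reflection-based conversion infers the schema from a JavaBean's getters, so the code below additionally assumes a Student bean roughly like this sketch:

import java.io.Serializable;

public class Student implements Serializable {
    private String sid;
    private String sname;
    private int sage;

    // Getters/setters are required: Spark discovers the columns through bean properties
    public String getSid() { return sid; }
    public void setSid(String sid) { this.sid = sid; }
    public String getSname() { return sname; }
    public void setSname(String sname) { this.sname = sname; }
    public int getSage() { return sage; }
    public void setSage(int sage) { this.sage = sage; }
}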
SparkSession spark = SparkSession.builder().master("local[*]").appName("Spark").getOrCreate();
// Read the text file and turn each comma-separated line into a Student bean
JavaRDD<String> source = spark.read().textFile("stuInfo.txt").javaRDD();
JavaRDD<Student> stuRDD = source.map(new Function<String, Student>() {
    public Student call(String line) throws Exception {
        String[] parts = line.split(",");
        Student stu = new Student();
        stu.setSid(parts[0]);
        stu.setSname(parts[1]);
        stu.setSage(Integer.valueOf(parts[2]));
        return stu;
    }
});
// Infer the schema from the Student bean via reflection
Dataset<Row> df = spark.createDataFrame(stuRDD, Student.class);
df.select("sid", "sname", "sage").coalesce(1).write().mode(SaveMode.Append).parquet("parquet.res");
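To sanity-check the write, the Parquet directory can be read straight back (a usage sketch):

// Read the Parquet output back, then print the inferred schema and contents
Dataset<Row> check = spark.read().parquet("parquet.res");
check.printSchema();
check.show();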
Using a programmatically built schema
SparkSession spark = SparkSession.builder().master("local[*]").appName("Spark").getOrCreate();
// Parse each comma-separated line into a generic Row
JavaRDD<String> source = spark.read().textFile("stuInfo.txt").javaRDD();
JavaRDD<Row> rowRDD = source.map(new Function<String, Row>() {
    public Row call(String line) throws Exception {
        String[] parts = line.split(",");
        String sid = parts[0];
        String sname = parts[1];
        int sage = Integer.parseInt(parts[2]);
        return RowFactory.create(sid, sname, sage);
    }
});
// Build the schema explicitly: one StructField per column
List<StructField> fields = new ArrayList<>();
fields.add(DataTypes.createStructField("sid", DataTypes.StringType, true));
fields.add(DataTypes.createStructField("sname", DataTypes.StringType, true));
fields.add(DataTypes.createStructField("sage", DataTypes.IntegerType, true));
StructType schema = DataTypes.createStructType(fields);
// Attach the schema to the RDD of Rows
Dataset<Row> df = spark.createDataFrame(rowRDD, schema);
df.coalesce(1).write().mode(SaveMode.Append).parquet("parquet.res1");
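As with the reflection variant, the result can be read back and queried through SQL; a sketch with a hypothetical filter:

// Register the Parquet output as a temporary view and query it
Dataset<Row> res = spark.read().parquet("parquet.res1");
res.createOrReplaceTempView("stu_parquet");
spark.sql("select sname, sage from stu_parquet where sage > 18").show();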