Constructing a table from a POJO:
package sparkSql;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

import spark.Person;

public class DataFrameDemo {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("test");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);

        // Parse each "name, age" line of people.txt into a Person bean.
        JavaRDD<Person> people = sc.textFile("D:\\WorkSpace\\Distributed_framework\\spark-1.5.1\\spark-1.5.1\\examples/src/main/resources/people.txt").map(
                new Function<String, Person>() {
                    public Person call(String line) throws Exception {
                        String[] parts = line.split(",");
                        Person person = new Person();
                        person.setName(parts[0]);
                        person.setAge(Integer.parseInt(parts[1].trim()));
                        return person;
                    }
                });

        // Spark infers the table schema from the Person bean's getters.
        DataFrame schemaPeople = sqlContext.createDataFrame(people, Person.class);
        // DataFrame schemaPeople = sqlContext.read().json("D:\\WorkSpace\\Distributed_framework\\spark-1.5.1\\spark-1.5.1\\examples/src/main/resources/people.json");
        schemaPeople.show();
        schemaPeople.printSchema();
        schemaPeople.select("name").show();
        schemaPeople.select(schemaPeople.col("name"), schemaPeople.col("age").plus(1)).show();

        // Register the DataFrame as a temporary table so it can be queried with SQL.
        schemaPeople.registerTempTable("people");
        DataFrame teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");
        teenagers.show();
    }

    public static void readJson(SQLContext sqlContext) {
        DataFrame df = sqlContext.read().json("D:\\WorkSpace\\Distributed_framework\\spark-1.5.1\\spark-1.5.1\\examples/src/main/resources/people.json");
        df.show();
        df.printSchema();
        df.select("name").show();
    }
}
// The Person bean lives in the spark package (matching the import above) and
// must be serializable so Spark can ship instances to executors.
package spark;

import java.io.Serializable;

public class Person implements Serializable {
    private String name;
    private int age;

    public String getName() {
        return name;
    }
    public void setName(String name) {
        this.name = name;
    }
    public int getAge() {
        return age;
    }
    public void setAge(int age) {
        this.age = age;
    }
}
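As a side note, the teenage filter above can also be written with the DataFrame column API instead of SQL. A minimal sketch, assuming the schemaPeople DataFrame built above (teens is just an illustrative name):

// Same predicate as the SQL query, expressed through the column API.
DataFrame teens = schemaPeople.filter(
        schemaPeople.col("age").geq(13).and(schemaPeople.col("age").leq(19)));
teens.select("name").show();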
Constructing a table with an explicit schema, without creating a POJO class in advance:
package sparkSql;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.util.ArrayList;
import java.util.List;

/**
 * Created by 奔荣 on 2017/2/25.
 */
public class SchemaSparkSql {
    public static void main(String[] args) {
        // sc is an existing JavaSparkContext.
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("test");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);

        /**
         * Execute SQL against a table whose schema is declared programmatically.
         */
        String file = "D:\\WorkSpace\\Distributed_framework\\spark-1.5.1\\spark-1.5.1\\examples/src/main/resources/people.txt";
        JavaRDD<String> people = sc.textFile(file);

        // Build the schema explicitly as a list of StructFields. Both columns
        // are declared as strings, matching the raw text file.
        List<StructField> fields = new ArrayList<StructField>();
        fields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
        fields.add(DataTypes.createStructField("age", DataTypes.StringType, true));
        StructType schema = DataTypes.createStructType(fields);

        // Convert records of the RDD (people) to Rows.
        JavaRDD<Row> rowRDD = people.map(
                new Function<String, Row>() {
                    public Row call(String record) throws Exception {
                        String[] parts = record.split(",");
                        return RowFactory.create(parts[0], parts[1].trim());
                    }
                });

        // Apply the schema to the RDD.
        DataFrame peopleDataFrame = sqlContext.createDataFrame(rowRDD, schema);
        peopleDataFrame.registerTempTable("people");
        DataFrame results = sqlContext.sql("SELECT * FROM people WHERE age >= 19");
        results.show();

        // The results of SQL queries are DataFrames and support all the normal RDD operations.
        // The columns of a row in the result can be accessed by ordinal.
        List<String> names = results.javaRDD().map(new Function<Row, String>() {
            public String call(Row row) {
                return "Name: " + row.getString(0);
            }
        }).collect();
        System.out.println(names);
    }
}
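Because the schema above declares age as a string, the age >= 19 comparison leans on Spark's implicit casting. To get a properly typed column instead, declare it as IntegerType and parse the value while building the rows. A minimal sketch, assuming sc, sqlContext, and file from the program above (typedFields, typedRows, and typedPeople are illustrative names):

// Variant of the schema above with a typed integer age column.
List<StructField> typedFields = new ArrayList<StructField>();
typedFields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
typedFields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));
StructType typedSchema = DataTypes.createStructType(typedFields);

JavaRDD<Row> typedRows = sc.textFile(file).map(
        new Function<String, Row>() {
            public Row call(String record) throws Exception {
                String[] parts = record.split(",");
                // The Row value must match the declared IntegerType.
                return RowFactory.create(parts[0], Integer.parseInt(parts[1].trim()));
            }
        });

DataFrame typedPeople = sqlContext.createDataFrame(typedRows, typedSchema);
typedPeople.printSchema(); // age now prints as integer rather than string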
DataFrame load/save examples:
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("test");
    JavaSparkContext sc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(sc);
    String inPath = "D:\\WorkSpace\\Distributed_framework\\spark-1.5.1\\spark-1.5.1\\examples/src/main/resources";
    String outPath = "G:\\data\\spark\\sql\\out";
    String parquetFile = inPath + "/users.parquet";

    /**
     * Load and save examples
     */
    // load() reads Parquet by default when no format is specified.
    DataFrame df = sqlContext.read().load(parquetFile);
    df.show();
    df.write().save(outPath + "/namesAndFavColors.parquet");

    // Load and save with an explicitly specified format.
    DataFrame dfJson = sqlContext.read().format("json").load(inPath + "/people.json");
    dfJson.select("name", "age").write().format("parquet").save(outPath + "/namesAndAges.parquet");
    dfJson.select("name", "age").write().format("json").save(outPath + "/namesAndAges.json");
}
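One caveat when re-running these examples: save() fails by default if the target path already exists. DataFrameWriter accepts a save mode to change that behavior; a minimal sketch, assuming dfJson and outPath from above (SaveMode is imported from org.apache.spark.sql.SaveMode):

// Overwrite the existing output directory instead of throwing an error.
dfJson.select("name", "age")
        .write()
        .mode(SaveMode.Overwrite)
        .format("json")
        .save(outPath + "/namesAndAges.json");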