Spark SQL demo

通过pojo构造table:

package sparkSql;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import spark.Person;


public class DataFrameDemo {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("test");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);

        JavaRDD<Person> people = sc.textFile("D:\\WorkSpace\\Distributed_framework\\spark-1.5.1\\spark-1.5.1\\examples/src/main/resources/people.txt").map(
                new Function<String, Person>() {
                    public Person call(String line) throws Exception {
                        String[] parts = line.split(",");

                        Person person = new Person();
                        person.setName(parts[0]);
                        person.setAge(Integer.parseInt(parts[1].trim()));

                        return person;
                    }
                });
        DataFrame schemaPeople = sqlContext.createDataFrame(people, Person.class);
//        DataFrame schemaPeople = sqlContext.read().json("D:\\WorkSpace\\Distributed_framework\\spark-1.5.1\\spark-1.5.1\\examples/src/main/resources/people.json");

        schemaPeople.show();
        schemaPeople.printSchema();
        schemaPeople.select("name").show();
        schemaPeople.select(schemaPeople.col("name"),schemaPeople.col("age").plus(1)).show();

        schemaPeople.registerTempTable("people");

        DataFrame teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");
        teenagers.show();

    }

    public static void readJson(SQLContext sqlContext) {
        DataFrame df = sqlContext.read().json("D:\\WorkSpace\\Distributed_framework\\spark-1.5.1\\spark-1.5.1\\examples/src/main/resources/people.json");
        df.show();
        df.printSchema();
        df.select("name").show();
    }
}

/**
 * Simple JavaBean used by Spark SQL to infer a DataFrame schema
 * ({@code name: String}, {@code age: int}).
 *
 * <p>Must stay {@code Serializable}: Spark ships instances between JVMs
 * when the RDD is shuffled or collected.
 */
public class Person implements Serializable {

    // Explicit serialVersionUID so serialized instances remain compatible
    // across recompiles of this class.
    private static final long serialVersionUID = 1L;

    private String name;
    private int age;

    /** @return the person's name (may be null if never set) */
    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    /** @return the person's age in years */
    public int getAge() {
        return age;
    }

    public void setAge(int age) {
        this.age = age;
    }
}




schema方式构造table,无需预先创建pojo类:

package sparkSql;


import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.util.ArrayList;
import java.util.List;

/**
 * Created by 奔荣 on 2017/2/25.
 */
public class SchemaSparkSql {
    public static void main(String[] args) {
        // sc is an existing JavaSparkContext.
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("test");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);

        String inPath = "D:\\WorkSpace\\Distributed_framework\\spark-1.5.1\\spark-1.5.1\\examples/src/main/resources";

        /**
         * schema方式执行sql
         */

        String file = "D:\\WorkSpace\\Distributed_framework\\spark-1.5.1\\spark-1.5.1\\examples/src/main/resources/people.txt";
        JavaRDD<String> people = sc.textFile(file);

        // Generate the schema based on the string of schema
        List<StructField> fields = new ArrayList<StructField>();
        fields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
        fields.add(DataTypes.createStructField("age", DataTypes.StringType, true));
        StructType schema = DataTypes.createStructType(fields);

        // Convert records of the RDD (people) to Rows.
        JavaRDD<Row> rowRDD = people.map(
                new Function<String, Row>() {
                    public Row call(String record) throws Exception {
                        String[] fields = record.split(",");
                        return RowFactory.create(fields[0], fields[1].trim());
                    }
                });

        // Apply the schema to the RDD.
        DataFrame peopleDataFrame = sqlContext.createDataFrame(rowRDD, schema);
        peopleDataFrame.registerTempTable("people");
        DataFrame results = sqlContext.sql("SELECT * FROM people where age >= 19 ");
        results.show();
        // The results of SQL queries are DataFrames and support all the normal RDD operations.
        // The columns of a row in the result can be accessed by ordinal.
        List<String> names = results.javaRDD().map(new Function<Row, String>() {
            public String call(Row row) {
                return "Name: " + row.getString(0);
            }
        }).collect();
        System.out.println(names);
    }


}

DataFrame 加载保存示例:

public static void main(String[] args) {

        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("test");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);

        String inPath = "D:\\WorkSpace\\Distributed_framework\\spark-1.5.1\\spark-1.5.1\\examples/src/main/resources";
        String outPath = "G:\\data\\spark\\sql\\out";

        String parquetFile = inPath + "/users.parquet";

        // ---- DataFrame load/save examples ----

        // read().load() defaults to the parquet format.
        DataFrame df = sqlContext.read().load(parquetFile);
        df.show();
        // Note the "/" separator: without it the concatenation produced
        // "...\outnamesAndFavColors.parquet" — a sibling of outPath rather
        // than a file inside it.
        df.write().save(outPath + "/namesAndFavColors.parquet");

        // Load and save with an explicitly specified format.
        DataFrame dfJson = sqlContext.read().format("json").load(inPath + "/people.json");
        dfJson.select("name", "age").write().format("parquet").save(outPath + "/namesAndAges.parquet");
        dfJson.select("name", "age").write().format("json").save(outPath + "/namesAndAges.json");

        // Release the SparkContext before exiting.
        sc.stop();

    }



  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值