Common DataFrame APIs in Spark SQL

package com.fosun.sparkdemo;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.types.DataTypes;

public class DataFrameJson {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("Json").setMaster("local"));
        SQLContext sqlContext = new SQLContext(sc);

        // Register a user-defined function (UDF)
//        sqlContext.udf().register("customF", new UDF1<String, String>() {
//            private static final long serialVersionUID = 1L;
//            @Override
//            public String call(String t1) throws Exception {
//                return "cast" + t1;
//            }
//        }, DataTypes.StringType);

        // Read a JSON file into a DataFrame
//        DataFrame df = sqlContext.read().json("D:/workspace/work2/sparkdemo/src/main/java/com/fosun/sparkdemo/resource/people.json");

        // Register the DataFrame as a temporary table
//        df.registerTempTable("people");

        // Select columns from the table, casting age to int
//        df = df.select(df.col("name"), df.col("age").cast(DataTypes.IntegerType));

        // Use the UDF in a SQL query
//        DataFrame df2 = sqlContext.sql("select customF(name) from people");

        // Deduplicate
//        df2 = df2.distinct();

        // Display the DataFrame
//        df2.show();

        // Filtering, equivalent to a SQL WHERE clause (filter and where behave the same); col and apply both fetch a column
//        df = df.filter(df.col("age").$eq$eq$eq(15).and(df.col("name").$eq$eq$eq("lisi")));
//        df = df.filter(df.col("age").$greater(14).$amp$amp(df.col("age").$less(16)));
//        df = df.filter(df.apply("age").$greater(14).$amp$amp(df.apply("age").$less(16))); // df.apply(column) == df.col(column)
//        df = df.filter("age = 15");
//        df = df.filter("name = 'lisi'");
//        df = df.filter("age = 15 && name = 'lisi'"); // error: && cannot be used in an expression string
//        df = df.where("age = 15").where("name = 'lisi'");
//        df.filter(df.col("age").gt(13)).show();

        // Add a new column computed from another column
//        df = df.withColumn("subAge", df.col("age").$plus(10));

        // Add a new constant (literal) column
//        df = df.withColumn("newColumn", functions.lit(11));

        // Rename a column
//        df = df.withColumnRenamed("subAge", "newAge");

        // Group by name, which adds a new count column
//        df = df.groupBy(df.col("name")).count();

        // agg applies aggregate functions; since the Java API lacks Scala's functional style, use the static methods in org.apache.spark.sql.functions
//        df = df.groupBy("name").agg(functions.avg(df.col("age")).alias("avg")); // and give it an alias

        // Sort
//        df = df.sort(df.col("name").asc(), df.col("age").desc());

        // Save the DataFrame as a parquet file
//        df.write().parquet("D:/workspace/work2/sparkdemo/src/main/java/com/fosun/sparkdemo/resource/people.parquet");

        // Read the parquet file back
//        DataFrame df2 = sqlContext.read().parquet("D:/workspace/work2/sparkdemo/src/main/java/com/fosun/sparkdemo/resource/people.parquet");

        // Print the DataFrame's schema, similar to a schema in SQL
//        df2.printSchema();
//        df.show();
//        df.printSchema();

        // Unix timestamps: Java's System.currentTimeMillis() is in milliseconds, whereas Spark SQL's unix_timestamp() returns seconds
//        DataFrame df = sqlContext.sql("select unix_timestamp() as ts"); // unit: seconds
//        df.show(); // 1492397384

        // Formatting a unix timestamp as a date string
//        DataFrame df = sqlContext.sql("select from_unixtime(unix_timestamp(),'yyyy-MM-dd HH:mm:ss') as datetime"); // unit: seconds
//        df.show(); // 2017-04-17 10:54:36

        sc.stop();
    }
}
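A note on the odd method names above: $eq$eq$eq, $greater, $less, $amp$amp, and $plus are the JVM-mangled names of Scala's ===, >, <, &&, and + column operators. From Java, the idiomatic spellings are equalTo, gt, lt, and, and plus; and inside an expression string you must use SQL syntax (and, not &&). A hedged Java equivalent of the filter lines above:

df = df.filter(df.col("age").equalTo(15).and(df.col("name").equalTo("lisi")));
df = df.filter(df.col("age").gt(14).and(df.col("age").lt(16)));
df = df.filter("age = 15 and name = 'lisi'"); // SQL 'and' works where && does not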
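Since every step in the class above is commented out, here is a minimal end-to-end sketch of the same flow against the Spark 1.6 Java API. The class name DataFrameJsonDemo and the relative path "people.json" are illustrative assumptions, not from the original:

package com.fosun.sparkdemo;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.types.DataTypes;

public class DataFrameJsonDemo {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("JsonDemo").setMaster("local"));
        SQLContext sqlContext = new SQLContext(sc);

        // Register the UDF before using it in SQL
        sqlContext.udf().register("customF", new UDF1<String, String>() {
            private static final long serialVersionUID = 1L;
            @Override
            public String call(String name) throws Exception {
                return "cast" + name;
            }
        }, DataTypes.StringType);

        DataFrame df = sqlContext.read().json("people.json"); // path is an assumption
        df.registerTempTable("people");

        // Apply the UDF through SQL and deduplicate the result
        sqlContext.sql("select customF(name) from people").distinct().show();

        // age is inferred as a string, so cast it before filtering and aggregating
        df = df.select(df.col("name"), df.col("age").cast(DataTypes.IntegerType));
        df = df.filter(df.col("age").gt(13));
        df.groupBy("name").agg(functions.avg(df.col("age")).alias("avg")).show();

        // Current unix timestamp (seconds) formatted as a date string
        sqlContext.sql("select from_unixtime(unix_timestamp(),'yyyy-MM-dd HH:mm:ss') as datetime").show();

        sc.stop();
    }
}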
The project's pom.xml:
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.fosun</groupId>
  <artifactId>sparkdemo</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>
  <name>sparkdemo</name>
  <url>http://maven.apache.org</url>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>
  <dependencies>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.10</artifactId>
      <version>1.6.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.10</artifactId>
      <version>1.6.2</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
  </dependencies>
  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.5.1</version>
        <configuration>
          <source>1.8</source>
          <target>1.8</target>
          <encoding>UTF-8</encoding>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
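With this pom in place, a typical build-and-run sequence (not from the original; assumes a local Spark 1.6 installation, and note that the code already sets master to local) would be:

mvn clean package
spark-submit --class com.fosun.sparkdemo.DataFrameJson target/sparkdemo-0.0.1-SNAPSHOT.jar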
The JSON file used in the project (people.json):
[{"name":"zhangsan","age":"12"},{"name":"lisi","age":"15"},{"name":"lisi","age":"18"}]