dataframe转化为array_疯狂Spark之DataFrame创建方式详解二(十)

最新推荐文章于 2024-05-16 22:05:01 发布

weixin_39691968

最新推荐文章于 2024-05-16 22:05:01 发布

阅读量735

点赞数

文章标签： dataframe转化为array sparksql dataframe变成csv保存

创建DataFrame的几种方式

1、读取parquet文件创建DataFrame

注意：

可以将DataFrame存储成parquet文件。保存成parquet文件的方式有两种（以下两行为Java写法；在Scala中write是无参方法，写作df.write.mode(...)，不带括号）：

df.write().mode(SaveMode.Overwrite).format("parquet").save("./sparksql/parquet");
df.write().mode(SaveMode.Overwrite).parquet("./sparksql/parquet");

SaveMode指定文件保存时的模式，参数解释如下。

Overwrite：覆盖

Append：追加

ErrorIfExists：如果存在就报错

Ignore：如果存在就忽略

代码演示

方式一：使用format方法

package com.gw.sparksql
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.SaveMode
/**
 * Saves a DataFrame as parquet through the generic `format("parquet").save(...)`
 * writer, then reads the parquet output back and prints it.
 */
object Parquet {
  /**
   * Loads the JSON text under `sparksql/json`, writes it as parquet to
   * `./sparksql/parquet` (overwriting existing output), and shows the
   * re-read result.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("parquet")
    val sc = new SparkContext(conf)
    try {
      val sqlContext = new SQLContext(sc)
      // Read the JSON input as an RDD of text lines.
      val jsonRDD = sc.textFile("sparksql/json")
      // Convert the RDD of JSON strings into a DataFrame.
      val df = sqlContext.read.json(jsonRDD)
      // Persist the DataFrame as parquet, replacing any previous output.
      df.write.mode(SaveMode.Overwrite).format("parquet").save("./sparksql/parquet")
      // Read the parquet files back to verify the round trip.
      val result = sqlContext.read.parquet("./sparksql/parquet")
      result.show()
    } finally {
      // Release the SparkContext even when one of the steps above throws.
      sc.stop()
    }
  }
}

运行结果

此时刷新项目，在项目的sparksql目录下会多出一个目录，如下图

方式二：使用parquet方法

package com.gw.sparksql
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.SaveMode
/**
 * Saves a DataFrame as parquet through the `parquet(...)` writer shortcut,
 * then reads the output back with the generic `format("parquet").load(...)`
 * reader and prints it.
 */
object Parquet {
  /**
   * Loads the JSON text under `sparksql/json`, writes it as parquet to
   * `./sparksql/parquet` (overwriting existing output), and shows the
   * re-read result.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("parquet")
    val sc = new SparkContext(conf)
    try {
      val sqlContext = new SQLContext(sc)
      // Read the JSON input as an RDD of text lines.
      val jsonRDD = sc.textFile("sparksql/json")
      // Convert the RDD of JSON strings into a DataFrame.
      val df = sqlContext.read.json(jsonRDD)
      // Persist the DataFrame as parquet, replacing any previous output.
      df.write.mode(SaveMode.Overwrite).parquet("./sparksql/parquet")
      // Read the parquet files back to verify the round trip.
      val result = sqlContext.read.format("parquet").load("./sparksql/parquet")
      result.show()
    } finally {
      // Release the SparkContext even when one of the steps above throws.
      sc.stop()
    }
  }
}

2、读取JDBC中的数据创建DataFrame(MySql为例)

在MySQL数据库中新建一个库spark，并在该库中新建两张数据表：score和person。

创建库语句：create database spark default charset utf8;

使用数据库：use spark;

创建数据库表语句

-- Score table: auto-incremented integer id, a name, and an integer score.
CREATE TABLE score (
    id    INT PRIMARY KEY AUTO_INCREMENT,
    name  VARCHAR(20),
    score INT
) ENGINE=InnoDB DEFAULT CHARSET utf8;

-- Person table: auto-incremented integer id, a name, and an integer age.
CREATE TABLE person (
    id   INT PRIMARY KEY AUTO_INCREMENT,
    name VARCHAR(20),
    age  INT
) ENGINE=InnoDB DEFAULT CHARSET utf8;

添加信息

-- Seed rows for the score table.
INSERT INTO score (id, name, score) VALUES
    (1, '张三', 98),
    (2, '李四', 78),
    (3, '王五', 68),
    (4, '赵六', 88);

-- Seed rows for the person table.
INSERT INTO person (id, name, age) VALUES
    (1, '张三', 23),
    (2, '李四', 33),
    (3, '王五', 25),
    (4, '赵六', 26);

在项目中添加连接数据库的jar包

代码演示

需求：连接数据库，读取数据库中数据

方式一：

package com.gw.sparksql
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import java.util.HashMap
/**
 * Reads the MySQL `person` table into a DataFrame by handing all JDBC
 * settings to the reader as a single options map.
 */
object Mysql {
  /**
   * Connects to the `spark` database on localhost, loads the `person`
   * table through the JDBC data source, and prints it.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("mysql")
    val sc = new SparkContext(conf)
    try {
      val sqlContext = new SQLContext(sc)
      // First approach: collect the connection settings in one map and
      // pass them to the reader in a single `options` call.
      val options = Map(
        "url" -> "jdbc:mysql://localhost:3306/spark",
        "driver" -> "com.mysql.jdbc.Driver",
        "user" -> "root",
        "password" -> "root",
        "dbtable" -> "person"
      )
      val person = sqlContext.read.format("jdbc").options(options).load()
      person.show()
    } finally {
      // The original never stopped the context; release it on every path.
      sc.stop()
    }
  }
}

运行结果

方式二：

package com.gw.sparksql
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
/**
 * Reads the MySQL `score` table into a DataFrame by setting each JDBC
 * option individually on the reader.
 */
object Mysql {
  /**
   * Connects to the `spark` database on localhost, loads the `score`
   * table through the JDBC data source, and prints it.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("mysql")
    val sc = new SparkContext(conf)
    try {
      val sqlContext = new SQLContext(sc)
      // Second approach: configure the reader one option at a time.
      // The calls are chained so each returned reader is actually used,
      // rather than relying on the builder being mutated in place.
      val score = sqlContext.read
        .format("jdbc")
        .option("url", "jdbc:mysql://localhost:3306/spark")
        .option("driver", "com.mysql.jdbc.Driver")
        .option("user", "root")
        .option("password", "root")
        .option("dbtable", "score")
        .load()
      score.show()
    } finally {
      // The original never stopped the context; release it on every path.
      sc.stop()
    }
  }
}

运行结果

两张表连接查询

package com.gw.sparksql
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import java.util.HashMap
/**
 * Loads the MySQL `person` and `score` tables as DataFrames, registers
 * both as temporary tables, and joins them on `name` with Spark SQL.
 */
object Mysql {
  /**
   * Reads both tables from the `spark` database on localhost and prints
   * the id, age, name and score of every person with a matching score row.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("mysql")
    val sc = new SparkContext(conf)
    try {
      val sqlContext = new SQLContext(sc)

      // First approach: pass all JDBC settings as one options map.
      val options = Map(
        "url" -> "jdbc:mysql://localhost:3306/spark",
        "driver" -> "com.mysql.jdbc.Driver",
        "user" -> "root",
        "password" -> "root",
        "dbtable" -> "person"
      )
      val person = sqlContext.read.format("jdbc").options(options).load()
      person.registerTempTable("person")

      // Second approach: set each JDBC option individually on the reader.
      val score = sqlContext.read
        .format("jdbc")
        .option("url", "jdbc:mysql://localhost:3306/spark")
        .option("driver", "com.mysql.jdbc.Driver")
        .option("user", "root")
        .option("password", "root")
        .option("dbtable", "score")
        .load()
      score.registerTempTable("score")

      // Join the two temporary tables on the shared name column.
      val result = sqlContext.sql("select person.id,person.age,score.name,score.score from person,score where person.name = score.name")
      result.show()
    } finally {
      // The original never stopped the context; release it on every path.
      sc.stop()
    }
  }
}

运行结果

向数据库中添加数据

需求：将查询出来的数据添加到一张result表中

package com.gw.sparksql
import java.util.HashMap
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.SaveMode
import java.util.Properties
/**
 * Joins the MySQL `person` and `score` tables with Spark SQL and appends
 * the joined rows to a `result` table in the same database.
 */
object Mysql {
  /**
   * Reads both tables from the `spark` database on localhost, prints the
   * join on `name`, and writes it to the `result` table in append mode.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("mysql")
    val sc = new SparkContext(conf)
    // One definition of the connection settings shared by reads and the write.
    val url = "jdbc:mysql://localhost:3306/spark"
    val driver = "com.mysql.jdbc.Driver"
    try {
      val sqlContext = new SQLContext(sc)

      // First approach: pass all JDBC settings as one options map.
      val options = Map(
        "url" -> url,
        "driver" -> driver,
        "user" -> "root",
        "password" -> "root",
        "dbtable" -> "person"
      )
      val person = sqlContext.read.format("jdbc").options(options).load()
      person.registerTempTable("person")

      // Second approach: set each JDBC option individually on the reader.
      val score = sqlContext.read
        .format("jdbc")
        .option("url", url)
        .option("driver", driver)
        .option("user", "root")
        .option("password", "root")
        .option("dbtable", "score")
        .load()
      score.registerTempTable("score")

      // Join the two temporary tables on the shared name column.
      val result = sqlContext.sql("select person.id,person.age,score.name,score.score from person,score where person.name = score.name")
      result.show()

      // Write the joined rows to MySQL, appending to the `result` table.
      val properties = new Properties()
      properties.setProperty("user", "root")
      properties.setProperty("password", "root")
      // Name the driver class for the write as well, matching the read side.
      properties.setProperty("driver", driver)
      result.write.mode(SaveMode.Append).jdbc(url, "result", properties)
    } finally {
      // Release the SparkContext even when a read or the write fails.
      sc.stop()
    }
  }
}

此时在数据库中查询，发现多出一张result表，表中存储的是刚刚连接查询出来的数据

3、读取Hive中的数据加载成DataFrame

HiveContext是SQLContext的子类，连接Hive建议使用HiveContext。

由于本地没有Hive环境，要提交到集群运行，提交命令：

# Submit the Hive example to the standalone cluster. The trailing backslashes
# keep this one command; --class is the fully qualified name of the Hive
# object shown in the code below.
./spark-submit --master spark://node01:7077,node02:7077 \
  --executor-cores 1 --executor-memory 2G --total-executor-cores 1 \
  --class com.gw.sparksql.Hive /root/test/HiveTest.jar

代码演示

package com.gw.sparksql
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.hive.HiveContext
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.spark.sql.SaveMode
/**
 * Loads two local text files into Hive tables, joins them on `name`, and
 * saves the joined rows as the Hive table `good_student_infos`.
 */
object Hive {
  /**
   * Recreates `student_infos` and `student_scores` in the Hive database
   * `spark`, loads them from `/root/test/`, and writes their join to
   * `good_student_infos` (overwriting).
   *
   * No master is set on the SparkConf, so it must be supplied at submit time.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("HiveSource")
    val sc = new SparkContext(conf)
    try {
      // HiveContext is a subclass of SQLContext.
      val hiveContext = new HiveContext(sc)
      hiveContext.sql("use spark")

      // The delimiter is written as '\\t' so the SQL text carries the tab
      // escape \t; the previous literal 't' split fields on the letter t.
      // NOTE(review): assumes the input files are tab-separated; confirm.
      hiveContext.sql("drop table if exists student_infos")
      hiveContext.sql("create table if not exists student_infos (name string,age int) row format  delimited fields terminated by '\\t'")
      hiveContext.sql("load data local inpath '/root/test/student_infos' into table student_infos")

      hiveContext.sql("drop table if exists student_scores")
      hiveContext.sql("create table if not exists student_scores (name string,score int) row format delimited fields terminated by '\\t'")
      hiveContext.sql("load data local inpath '/root/test/student_scores' into table student_scores")

      // Join student info with scores on the shared name column.
      val df = hiveContext.sql("select si.name,si.age,ss.score from student_infos si,student_scores ss where si.name = ss.name")
      hiveContext.sql("drop table if exists good_student_infos")
      // Write the joined result into a Hive table.
      df.write.mode(SaveMode.Overwrite).saveAsTable("good_student_infos")
    } finally {
      // Release the SparkContext even when one of the Hive statements fails.
      sc.stop()
    }
  }
}