import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
object taxi_csv {
  def main(args: Array[String]): Unit = {
    val csv_path = "datas/taxi.csv"
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("csv")
      .getOrCreate()
    val schema = StructType(Array(
      StructField("tid", LongType),
      StructField("lan", StringType),
      StructField("len", StringType),
      StructField("time", StringType)
    ))
    // Register the CSV as a temp view for the SQL-based approach
    // (spark.read replaces the deprecated spark.sqlContext.read)
    spark.read
      .format("csv")
      .option("header", false)
      .schema(schema)
      .load(csv_path)
      .createOrReplaceTempView("taxi")
    // Load the same CSV again as a DataFrame for the API-based approach
    val taxiDF = spark.read
      .format("csv")
      .option("header", false)
      .schema(schema)
      .load(csv_path)
    // Extract the hour as the first two characters of the time column
    val hourDF = taxiDF.select(taxiDF.col("tid"), taxiDF.col("time").substr(0, 2).as("hour"))
    spark.sql(
      """
        |SELECT tid, SUBSTR(time, 0, 2) AS hour
        |FROM taxi
      """.stripMargin).createOrReplaceTempView("taxi_hour")
    val hour_groupDF = hourDF.groupBy("tid", "hour").count()
    // Equivalent to the SQL below
    spark.sql(
      """
        |SELECT tid, hour, COUNT(1) AS count
        |FROM taxi_hour
        |GROUP BY tid, hour
      """.stripMargin).createOrReplaceTempView("taxi_hour_count")
    // Note: this replaces the taxi_hour_count view registered just above
    // with the DataFrame version of the same aggregation
    hour_groupDF.createOrReplaceTempView("taxi_hour_count")
    // The same ranking query, once registered as a view and once captured as a DataFrame
    spark.sql(
      """
        |SELECT tid, hour, count,
        |ROW_NUMBER() OVER (PARTITION BY hour ORDER BY count DESC) AS rnk
        |FROM taxi_hour_count
      """.stripMargin).createOrReplaceTempView("taxi_hour_count_rnk")
    val taxi_hour_count_rnkDF = spark.sql(
      """
        |SELECT tid, hour, count,
        |ROW_NUMBER() OVER (PARTITION BY hour ORDER BY count DESC) AS rnk
        |FROM taxi_hour_count
      """.stripMargin)
    // Keep the top 5 taxis per hour
    val result = taxi_hour_count_rnkDF.filter(taxi_hour_count_rnkDF.col("rnk") <= 5)
    result.coalesce(3).write
      .format("csv")
      .option("header", true)
      // If this line is not commented out, the final output always ends up as
      // 12 partition files, regardless of how the data was partitioned upstream
      .partitionBy("hour")
      .mode(SaveMode.Overwrite)
      .save("/out_datas/taxi_groupbyhour")
  }
}
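For comparison, the same top-5-per-hour ranking can be expressed with the DataFrame API instead of SQL, using a window spec. A minimal sketch, assuming the hour_groupDF computed above:

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.row_number

// Rank taxis within each hour by trip count, descending
val byHour = Window.partitionBy("hour").orderBy(hour_groupDF.col("count").desc)
val top5PerHour = hour_groupDF
  .withColumn("rnk", row_number().over(byHour))
  .filter("rnk <= 5")
top5PerHour.show()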
Usage of the related APIs is shown below, along with converting an RDD to a DataFrame.
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}
/**
 * Interoperating between DataFrames and RDDs
 */
object DataFrameRDDApp {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("DataFrameRDDApp").master("local[2]").getOrCreate()
    inferReflection(spark)
    // program(spark)
    spark.stop()
  }
  def program(spark: SparkSession): Unit = {
    // RDD ==> DataFrame, the programmatic approach: build Rows plus an explicit schema
    val rdd = spark.sparkContext.textFile("datas/student.txt")
    val infoRDD = rdd.map(_.split("\\|")).map(line => Row(line(0).toInt, line(1), line(2), line(3)))
    val structType = StructType(Array(
      StructField("id", IntegerType, true),
      StructField("name", StringType, true),
      StructField("phone", StringType, true),
      StructField("email", StringType, true)))
    val infoDF = spark.createDataFrame(infoRDD, structType)
    infoDF.printSchema()
    infoDF.show()
    // Query via the DataFrame API
    infoDF.filter(infoDF.col("id") > 5).show()
    infoDF.filter(infoDF.col("name").startsWith("a")).show()
    // Query via SQL
    infoDF.createOrReplaceTempView("infos")
    spark.sql("select * from infos where id > 30").show()
  }
  def inferReflection(spark: SparkSession): Unit = {
    // RDD ==> DataFrame, the reflection approach: map rows to a case class
    val rdd = spark.sparkContext.textFile("datas/student.txt")
    // Note: the implicit conversions must be imported for toDF() to be available
    import spark.implicits._
    val studentDF = rdd.map(_.split("\\|")).map(x => Student(x(0).toInt, x(1), x(2), x(3))).toDF()
    studentDF.show()
    studentDF.filter("substr(name,0,1) = 'a'").show()
    studentDF.filter(studentDF.col("id") > 6).show()
    // select (not filter) is needed to expose the boolean as a named column
    studentDF.select(studentDF.col("name").startsWith("a").as("re_name")).show()
    studentDF.createOrReplaceTempView("student")
    spark.sql("select * from student where id > 18").show()
  }
  case class Student(id: Int, name: String, phone: String, email: String)
}
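The conversion also runs in the other direction. A minimal sketch of going from a DataFrame back to an RDD, assuming the studentDF built in inferReflection above:

// DataFrame ==> RDD: each element comes back as an org.apache.spark.sql.Row
val studentRDD = studentDF.rdd
studentRDD.map(row => (row.getAs[Int]("id"), row.getAs[String]("name")))
  .collect()
  .foreach(println)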
Problem: when saving data to the local machine on Windows, the job fails with the error HADOOP_HOME and hadoop.home.dir are unset.
Solution: unpack a Hadoop distribution to a local folder, put winutils.exe into its bin directory, set the HADOOP_HOME environment variable to that folder, then restart IDEA and carry on coding.
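As an alternative to the environment variable, hadoop.home.dir can be set programmatically before the SparkSession is created. A minimal sketch, assuming Hadoop was unpacked to the hypothetical path C:\hadoop:

// Assumption: bin\winutils.exe lives under C:\hadoop (hypothetical path)
System.setProperty("hadoop.home.dir", "C:\\hadoop")
val spark = SparkSession.builder().master("local[*]").appName("csv").getOrCreate()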
val mysqlDF = spark.read.format("jdbc")
  .option("url", "jdbc:mysql://master:3306")
  .option("dbtable", "employees.departments")
  .option("user", "root")
  .option("password", "123456")
  // Put the MySQL connector jar into the jars folder first, otherwise a
  // "class not found" error is thrown; for Connector/J 8.x the driver
  // class is com.mysql.cj.jdbc.Driver
  .option("driver", "com.mysql.jdbc.Driver")
  .load()
The snippet above reads an external MySQL data source through Spark; the options carry the connection parameters.
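Writing back to MySQL goes through the same jdbc format on the writer side. A minimal sketch, assuming a hypothetical target table employees.departments_copy and the same connection details:

mysqlDF.write.format("jdbc")
  .option("url", "jdbc:mysql://master:3306")
  .option("dbtable", "employees.departments_copy") // hypothetical target table
  .option("user", "root")
  .option("password", "123456")
  .option("driver", "com.mysql.jdbc.Driver")
  .mode(SaveMode.Overwrite)
  .save()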