// 标记一个DEMO以便于后面复习 (demo bookmarked here for later review)
// DEMO背景：阿里天池竞赛系列——口碑商家客流量预测
// (background: Alibaba Tianchi competition series — Koubei merchant customer-flow prediction)
// 第一步：取样——获取训练数据 (step 1: sampling — build the training data set)
package com.huadian.bigdata.ijcai
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
/**
 * Samples training data for the IJCAI (Alibaba Tianchi) Koubei
 * customer-flow prediction contest.
 *
 * Reads the raw user-view and user-pay logs (header-less CSV with columns
 * user_id, shop_id, time_stamp), keeps only the 28 training days
 * (2016-09-20 .. 2016-09-26 and 2016-10-11 .. 2016-10-31), down-samples
 * the much larger pay log, and writes each result as a single headered
 * CSV file.
 */
object IJCAIUserSpark {

  /**
   * Time-window predicate shared by both data sets (view and pay logs).
   *
   * Uses half-open intervals (>= lower, < upper) instead of SQL BETWEEN:
   * BETWEEN is inclusive on both ends, so the previous upper bounds
   * '2016-11-01 00:00:00' and '2016-09-27 00:00:00' wrongly admitted rows
   * stamped exactly at midnight of the day after each window, contradicting
   * the intended 21-day + 7-day (28 days total) training period.
   */
  private val TrainWindowFilter: String =
    """
      |   (time_stamp >= '2016-10-11 00:00:00' and time_stamp < '2016-11-01 00:00:00')
      |or (time_stamp >= '2016-09-20 00:00:00' and time_stamp < '2016-09-27 00:00:00')
    """.stripMargin

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("SparkSQLDemo")
      .master("local[5]")
      .getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    // Both input files share the same three-column layout; time_stamp is
    // kept as a string so lexicographic comparison matches chronological
    // order for the 'yyyy-MM-dd HH:mm:ss' format.
    val schema = StructType(
      StructField("user_id", IntegerType, true)
        :: StructField("shop_id", IntegerType, true)
        :: StructField("time_stamp", StringType, true)
        :: Nil
    )

    /**
     * Load the user browsing log: user_id, shop_id, time_stamp.
     * Raw data spans 2016-10-18 .. 2016-10-31.
     */
    val userViewDF = spark.read
      .schema(schema)
      .csv("file:///F:/IJCAI/user_view")
    println(s"user view Count = ${userViewDF.count}") //10,106,644
    userViewDF.printSchema()
    userViewDF.show(5, false)

    /**
     * Filter with SQL to obtain the training rows:
     * 2016-10-11 .. 2016-10-31 plus 2016-09-20 .. 2016-09-26 (28 days).
     */
    userViewDF.createOrReplaceTempView("view_tmo_user_view")
    val trainUserViewDF = spark.sql(
      s"""
         |select
         |  user_id, shop_id, time_stamp
         |from
         |  view_tmo_user_view
         |where
         |$TrainWindowFilter
       """.stripMargin)
    println(s"train User View count${trainUserViewDF.count()}") //928578

    // sample(withReplacement, fraction, seed):
    //   withReplacement — whether a row may be drawn more than once
    //   fraction        — expected fraction of rows to keep
    //   seed            — RNG seed, fixed for reproducibility
    // Kept commented out for reference:
    // trainUserViewDF.sample(false, 0.3, 6000L).show(30)

    // Persist the training view log as a single headered CSV file
    // (coalesce(1) so the output is one part-file, convenient for a demo).
    trainUserViewDF
      .coalesce(1)
      .write
      .mode(SaveMode.Overwrite)
      .option("header", "true")
      .csv("file:///F:/IJCAI/train_user_view")

    // Load the user payment log (same schema as the view log).
    val userPayDF = spark
      .read
      .schema(schema)
      .csv("file:///F:/IJCAI/user_pay.txt")
    println(s"user pay count ${userPayDF.count()}") //69674110

    // The pay log is roughly 7x the view log, so keep a 30% sample
    // (without replacement, fixed seed) after applying the same window.
    val trainUserPayDF = userPayDF
      .where(TrainWindowFilter)
      .sample(false, 0.3, 6000L)
    trainUserPayDF
      .coalesce(1)
      .write
      .mode(SaveMode.Overwrite)
      .option("header", "true")
      .csv("file:///F:/IJCAI/train_user_pay")

    // NOTE(review): the "extract test-period pay data" step announced by the
    // original trailing comment was never implemented — TODO confirm intent.
    spark.close()
  }
}