// 标记一个DEMO以便于后面复习 (demo bookmarked here for later review)
// DEMO背景：阿里天池竞赛系列——口碑商家客流量预测
// (background: Alibaba Tianchi competition series — Koubei merchant customer-flow prediction)
// 第一步：取样——获取训练数据 (step 1: sampling — build the training data set)
package com.huadian.bigdata.ijcai
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
/**
 * Samples training data for the IJCAI (Alibaba Tianchi) Koubei
 * customer-flow prediction contest.
 *
 * Reads the raw user-view and user-pay logs (header-less CSV with columns
 * user_id, shop_id, time_stamp), keeps only the 28 training days
 * (2016-09-20 .. 2016-09-26 and 2016-10-11 .. 2016-10-31), down-samples
 * the much larger pay log, and writes each result as a single headered
 * CSV file.
 */
object IJCAIUserSpark {

  /**
   * Time-window predicate shared by both data sets (view and pay logs).
   *
   * Uses half-open intervals (>= lower, < upper) instead of SQL BETWEEN:
   * BETWEEN is inclusive on both ends, so the previous upper bounds
   * '2016-11-01 00:00:00' and '2016-09-27 00:00:00' wrongly admitted rows
   * stamped exactly at midnight of the day after each window, contradicting
   * the intended 21-day + 7-day (28 days total) training period.
   */
  private val TrainWindowFilter: String =
    """
      |   (time_stamp >= '2016-10-11 00:00:00' and time_stamp < '2016-11-01 00:00:00')
      |or (time_stamp >= '2016-09-20 00:00:00' and time_stamp < '2016-09-27 00:00:00')
    """.stripMargin

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("SparkSQLDemo")
      .master("local[5]")
      .getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    // Both input files share the same three-column layout; time_stamp is
    // kept as a string so lexicographic comparison matches chronological
    // order for the 'yyyy-MM-dd HH:mm:ss' format.
    val schema = StructType(
      StructField("user_id", IntegerType, true)
        :: StructField("shop_id", IntegerType, true)
        :: StructField("time_stamp", StringType, true)
        :: Nil
    )

    /**
     * Load the user browsing log: user_id, shop_id, time_stamp.
     * Raw data spans 2016-10-18 .. 2016-10-31.
     */
    val userViewDF = spark.read
      .schema(schema)
      .csv("file:///F:/IJCAI/user_view")
    println(s"user view Count = ${userViewDF.count}") //10,106,644
    userViewDF.printSchema()
    userViewDF.show(5, false)

    /**
     * Filter with SQL to obtain the training rows:
     * 2016-10-11 .. 2016-10-31 plus 2016-09-20 .. 2016-09-26 (28 days).
     */
    userViewDF.createOrReplaceTempView("view_tmo_user_view")
    val trainUserViewDF = spark.sql(
      s"""
         |select
         |  user_id, shop_id, time_stamp
         |from
         |  view_tmo_user_view
         |where
         |$TrainWindowFilter
       """.stripMargin)
    println(s"train User View count${trainUserViewDF.count()}") //928578

    // sample(withReplacement, fraction, seed):
    //   withReplacement — whether a row may be drawn more than once
    //   fraction        — expected fraction of rows to keep
    //   seed            — RNG seed, fixed for reproducibility
    // Kept commented out for reference:
    // trainUserViewDF.sample(false, 0.3, 6000L).show(30)

    // Persist the training view log as a single headered CSV file
    // (coalesce(1) so the output is one part-file, convenient for a demo).
    trainUserViewDF
      .coalesce(1)
      .write
      .mode(SaveMode.Overwrite)
      .option("header", "true")
      .csv("file:///F:/IJCAI/train_user_view")

    // Load the user payment log (same schema as the view log).
    val userPayDF = spark
      .read
      .schema(schema)
      .csv("file:///F:/IJCAI/user_pay.txt")
    println(s"user pay count ${userPayDF.count()}") //69674110

    // The pay log is roughly 7x the view log, so keep a 30% sample
    // (without replacement, fixed seed) after applying the same window.
    val trainUserPayDF = userPayDF
      .where(TrainWindowFilter)
      .sample(false, 0.3, 6000L)
    trainUserPayDF
      .coalesce(1)
      .write
      .mode(SaveMode.Overwrite)
      .option("header", "true")
      .csv("file:///F:/IJCAI/train_user_pay")

    // NOTE(review): the "extract test-period pay data" step announced by the
    // original trailing comment was never implemented — TODO confirm intent.
    spark.close()
  }
}