Reading MySQL data with Spark SQL and generating CSV files

I. Preface

The following software needs to be installed in advance: Hadoop 2.7.2, Scala 2.11.11, MySQL, and Spark 2.2.1.
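For reference, a minimal sketch of the build dependencies, assuming sbt is used; the mysql-connector-java and fastjson versions below are assumptions rather than values from the original setup:

scalaVersion := "2.11.11"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-sql"            % "2.2.1",
  "mysql"            %  "mysql-connector-java" % "5.1.47",  // provides com.mysql.jdbc.Driver used below
  "com.alibaba"      %  "fastjson"             % "1.2.47"   // JSON parsing of the response field
)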

II. Main logic

1. MySQL data format

MySQL table structure (the record table; the query below reads its response, request, and create_time columns)

Structure of the response field:

{"bulletin":"","couponList":[],"createTime":"","deliveryFee":8,"deliveryMsg":"","deliveryTime":"0","deliveryType":0,"dpShopId":0,"itemList":[{"activityTag":"","activityType":0,"bigImageUrl":"","cActivityTag":"","cTag":"126421929","categoryName":"","categoryType":"0","createTime":"","currentPrice":1.0,"dpShopId":0,"iconUrl":"","littleImageUrl":"","mtWmPoiId":"","originPrice":1.0,"praiseNum":0,"sellStatus":0,"shopName":"**茶餐厅*","spuDesc":"","spuId":1997065665,"spuName":"","spuPromotionInfo":"","statusDesc":"","tag":"126421929","unit":""}],"minFee":20.0,"mtWmPoiId":"*","onlinePay":1,"shipping_time":"","shopName":"**茶餐厅*","shopPic":"","shopStatus":0}

2. Spark reads the MySQL data and saves it as CSV


import java.text.SimpleDateFormat
import java.util.Date

import com.alibaba.fastjson.{JSON, JSONArray}
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

import scala.collection.mutable.ListBuffer

object SparkSqlShopItemToCsv {

  def main(args: Array[String]): Unit = {
    val session = SparkSession
      .builder()
      .appName("shop")
      .master("local[4]")
      .getOrCreate()
    import session.implicits._
    // load the record table from MySQL over JDBC
    val recordRDD:DataFrame = session.read.format("jdbc").options(Map("url" -> "jdbc:mysql://localhost/wm", "driver" -> "com.mysql.jdbc.Driver"
      , "dbtable" -> "record", "user" -> "root", "password" -> "123456")).load()
    recordRDD.createOrReplaceTempView("record")
    // read the data from the record table
    val recordDf = session.sql("select a.response,a.request,a.create_time from record a where a.create_time >'2020-06-13 22:00:30'")

    val currentDate = getNowDate("yyyy-MM-dd HH:mm:ss")
    import scala.collection.JavaConversions._
    // process each row of the record table
    val ds = recordDf.map(one => {
      // get the value of the response field
      val response = one.getAs[String]("response")
      val recordJsonObject = JSON.parseObject(response)
      val mtWmPoiId = recordJsonObject.getString("mtWmPoiId")
      val shopName = recordJsonObject.getString("shopName")
      // shop
      val shop = new Shop(mtWmPoiId, shopName, currentDate)
      val itemArray: JSONArray =recordJsonObject.getJSONArray("itemList")
      val itemObjectList: List[AnyRef] = itemArray.iterator().toList

      val itemListBuffer = ListBuffer[Item]()
      val relationListBuffer = ListBuffer[Relation]()
      for(x <-itemObjectList){
        val itemJsonObject = JSON.parseObject(x.toString)
        val spuId = itemJsonObject.getLong("spuId")
        val spuName = itemJsonObject.getString("spuName")
        // item
        val item = new Item(spuId, spuName, currentDate)
        itemListBuffer.append(item)
        // shop-item relation
        val relation = new Relation(mtWmPoiId, shopName, spuId, spuName, currentDate)
        relationListBuffer.append(relation)
      }
      val itemList = itemListBuffer.toList
      val relationList = relationListBuffer.toList
      (shop, itemList, relationList)
    })

    // split out shops, items, and shop-item relations
    val shopDs:Dataset[Shop] = ds.map(one => {
      one._1
    })
    val itemDs: Dataset[Item] = ds.flatMap(one => {
      one._2
    })
    val relationDs: Dataset[Relation] = ds.flatMap(one => {
      one._3
    })

    // register temp views; duplicates are removed below with SELECT DISTINCT
    shopDs.createOrReplaceTempView("shop")
    itemDs.createOrReplaceTempView("item")
    relationDs.createOrReplaceTempView("relation")
    
    // write to HDFS as CSV files
    val date = getNowDate("yyyyMMdd")
    session.sql("select DISTINCT a.wm_poi_id,a.shop_name,a.create_time from shop a ").coalesce(4).write.option("header", "true").mode("Append").csv("hdfs://ELK01:9000/tmp/csv/"+date+"/shop.csv")
    session.sql("select DISTINCT b.spu_id,b.spu_name,b.create_time from item b ").coalesce(4).write.option("header", "true").mode("Append").csv("hdfs://ELK01:9000/tmp/csv/"+date+"/item.csv")
    session.sql("select DISTINCT c.wm_poi_id,c.shop_name,c.spu_id,c.spu_name,c.create_time from relation c ").coalesce(4).write.option("header", "true").mode("Append").csv("hdfs://ELK01:9000/tmp/csv/"+date+"/relation.csv")
  }

  // format the current time with the given pattern
  def getNowDate(dataFormat: String): String = {
    val dateFormat = new SimpleDateFormat(dataFormat)
    dateFormat.format(new Date())
  }

  case class Item(spu_id:Long, spu_name:String,create_time:String)

  case class Shop(wm_poi_id:String, shop_name:String,create_time:String)

  case class Relation(wm_poi_id:String, shop_name:String,spu_id:Long,spu_name:String,create_time:String)

}

3. Verification

1. The data is on HDFS (the shop.csv, item.csv and relation.csv output directories under /tmp/csv/<date>/)

2. Data format (a read-back sketch follows the captions below)

shop data (columns: wm_poi_id, shop_name, create_time)

item data (columns: spu_id, spu_name, create_time)

relation data (columns: wm_poi_id, shop_name, spu_id, spu_name, create_time)
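Beyond browsing HDFS, the written CSV directories can be read back with Spark to confirm the schema and row counts. A quick sketch, assuming it runs in the same session and object as the job above (only shop.csv is shown; item.csv and relation.csv work the same way):

val date   = getNowDate("yyyyMMdd")
val shopDf = session.read.option("header", "true").csv("hdfs://ELK01:9000/tmp/csv/" + date + "/shop.csv")
shopDf.printSchema()                  // expect wm_poi_id, shop_name, create_time
shopDf.show(10, truncate = false)     // peek at a few rows
println(s"shop rows: ${shopDf.count()}")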

3. Purpose of the CSV files

The CSV files are bulk-imported into Neo4j to create Shop and Item labeled nodes and the relationships between them, as sketched below.
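One way to do the import is LOAD CSV through the Neo4j Java driver (neo4j-admin import is another option). The sketch below is assumption-heavy: it presumes the CSV part files have been merged and copied from HDFS into Neo4j's import directory as shop.csv, item.csv and relation.csv, that the 1.x neo4j-java-driver is on the classpath, and it uses a made-up relationship type SELLS and placeholder connection details:

import org.neo4j.driver.v1.{AuthTokens, GraphDatabase}

object CsvToNeo4j {
  def main(args: Array[String]): Unit = {
    // connection details are placeholders
    val driver  = GraphDatabase.driver("bolt://localhost:7687", AuthTokens.basic("neo4j", "password"))
    val session = driver.session()
    try {
      // shop nodes
      session.run("LOAD CSV WITH HEADERS FROM 'file:///shop.csv' AS row " +
        "MERGE (s:Shop {wm_poi_id: row.wm_poi_id}) SET s.shop_name = row.shop_name")
      // item nodes
      session.run("LOAD CSV WITH HEADERS FROM 'file:///item.csv' AS row " +
        "MERGE (i:Item {spu_id: row.spu_id}) SET i.spu_name = row.spu_name")
      // shop-item relationships (SELLS is an assumed name)
      session.run("LOAD CSV WITH HEADERS FROM 'file:///relation.csv' AS row " +
        "MATCH (s:Shop {wm_poi_id: row.wm_poi_id}) MATCH (i:Item {spu_id: row.spu_id}) " +
        "MERGE (s)-[:SELLS]->(i)")
    } finally {
      session.close()
      driver.close()
    }
  }
}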
