练习1
1、求出哪些省份没有农产品市场
2、获取菜的种类最多的三个省份
3、获取每个省份菜的种类最多的三个农贸市场
//数据实例:
//省份:
/*
*山西
*辽宁
*吉林
*/
//菜品记录:
/*
*香菜 2.80 2018/1/1 山西汾阳市晋阳农副产品批发市场 山西 汾阳
*大葱 2.80 2018/1/1 山西汾阳市晋阳农副产品批发市场 山西 汾阳
*葱头 1.60 2018/1/1 山西汾阳市晋阳农副产品批发市场 山西 汾阳
*/
package com.test
import scala.io.Source
/**
 * Exercise 1: statistics over agricultural-market price records.
 *
 * A product record is one tab-separated line with six columns. The columns used here are
 * index 0 (dish name), index 3 (market name) and index 4 (province). Lines that do not
 * split into exactly six columns are ignored.
 */
object Test {

  /** Column separator of a product record. */
  private val Separator = "\t"

  /** Number of columns a well-formed product record has. */
  private val FieldCount = 6

  /**
   * Reads the province list and the product records from disk and prints the three results.
   *
   * Both files are closed even when reading or processing fails.
   */
  def main(args: Array[String]): Unit = {
    val source1 = Source.fromFile("D:\\省份数据.txt", "utf-8")
    try {
      val source2 = Source.fromFile("D:\\菜品.txt", "utf-8")
      try {
        val province = source1.getLines().toList
        val products = source2.getLines().toList
        println(m1(province, products))
        println(m2(products))
        m3(products).foreach(println)
      } finally {
        source2.close()
      }
    } finally {
      source1.close()
    }
  }

  /**
   * Splits a product line once and keeps it only when it has the expected column count.
   *
   * @param line raw product record
   * @return the columns, or `None` for a malformed line
   */
  private def fields(line: String): Option[Array[String]] = {
    val cols = line.split(Separator)
    if (cols.length == FieldCount) Some(cols) else None
  }

  /**
   * 1. Finds the provinces that have no agricultural market.
   *
   * @param province all province names
   * @param product  raw product records
   * @return the provinces that never occur in the province column of a valid record
   */
  def m1(province: List[String], product: List[String]): List[String] = {
    // Provinces that appear in at least one valid record, without duplicates.
    val withMarket = product.flatMap(fields).map(_(4)).distinct
    // Set difference: everything that is left has no market.
    province.diff(withMarket)
  }

  /**
   * 2. Finds the three provinces with the most distinct dishes.
   *
   * @param product raw product records
   * @return up to three (province, distinct dish count) pairs, largest count first
   */
  def m2(product: List[String]): List[(String, Int)] = {
    product.flatMap(fields)
      // (province, dish)
      .map(cols => (cols(4), cols(0)))
      // The same dish may be listed many times; count it once per province.
      .distinct
      .groupBy(_._1)
      .map { case (prov, dishes) => (prov, dishes.length) }
      .toList
      // Ascending sort followed by reverse gives descending order.
      .sortBy(_._2).reverse.take(3)
  }

  /**
   * 3. Finds, for every province, the three markets with the most distinct dishes.
   *
   * @param products raw product records
   * @return province -> up to three ((province, market), distinct dish count) entries,
   *         largest count first
   */
  def m3(products: List[String]): Map[String, List[((String, String), Int)]] = {
    products.flatMap(fields)
      // (province, market, dish)
      .map(cols => (cols(4), cols(3), cols(0)))
      // Deduplicate BEFORE grouping so a dish listed on several dates counts as one kind.
      .distinct
      .groupBy { case (prov, market, _) => (prov, market) }
      .toList
      // Number of distinct dishes per (province, market).
      .map { case (key, dishes) => (key, dishes.length) }
      .groupBy(_._1._1)
      .map { case (prov, markets) => (prov, markets.sortBy(_._2).reverse.take(3)) }
  }
}
练习2
需求: 每个区域的平均等客时间[每个区域所有司机的等客时间平均值]
司机ID 上车区域ID 下车区域ID 上车时间 下车时间
/**
 * Exercise 2: prints the average passenger-waiting time, in seconds, of every region.
 *
 * Each record holds: driver id, pickup region, drop-off region, pickup time, drop-off time.
 * A waiting time is the gap between the end of one trip and the start of the same driver's
 * next trip; it is attributed to the region where the earlier trip ended. One
 * `(region, averageSeconds)` tuple is printed per region.
 *
 * @param args unused
 */
def main(args: Array[String]): Unit = {
  import java.text.SimpleDateFormat

  val datas = List(
    "A 龙华区 宝安区 2020-07-15 10:05:10 2020-07-15 10:25:02",
    "B 宝安区 福田区 2020-07-15 11:43:22 2020-07-15 11:55:45",
    "A 龙岗区 宝安区 2020-07-15 11:55:55 2020-07-15 12:12:23",
    "B 福田区 宝安区 2020-07-15 12:05:05 2020-07-15 12:22:33",
    "A 龙岗区 龙华区 2020-07-15 11:02:08 2020-07-15 11:17:15",
    "A 宝安区 龙岗区 2020-07-15 10:35:15 2020-07-15 10:40:50",
    "B 龙华区 龙岗区 2020-07-15 10:45:25 2020-07-15 10:50:00",
    "A 龙华区 龙岗区 2020-07-15 11:33:12 2020-07-15 11:45:35",
    "B 宝安区 龙岗区 2020-07-15 12:27:20 2020-07-15 12:43:31",
    "A 宝安区 龙岗区 2020-07-15 12:17:10 2020-07-15 12:33:21",
    "B 福田区 龙华区 2020-07-15 10:15:21 2020-07-15 10:35:12",
    "B 龙岗区 宝安区 2020-07-15 11:12:18 2020-07-15 11:27:25")

  // The timestamps contain a space themselves, so a plain split on whitespace would cut them
  // in two, and a split on tab fails when the columns are separated by spaces. The pattern
  // accepts either separator and keeps each "date time" pair together.
  val stamp = """(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})"""
  val Record = ("""\s*(\S+)\s+(\S+)\s+(\S+)\s+""" + stamp + """\s+""" + stamp + """\s*""").r

  // One formatter is enough: everything here runs on a single thread.
  val format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")

  // (driver, pickup region, drop-off region, pickup epoch seconds, drop-off epoch seconds)
  val trips = datas.map {
    case Record(driver, pickupRegion, dropRegion, pickupAt, dropAt) =>
      (driver,
        pickupRegion,
        dropRegion,
        format.parse(pickupAt).getTime / 1000,
        format.parse(dropAt).getTime / 1000)
    case other =>
      throw new IllegalArgumentException(s"Malformed trip record: $other")
  }

  // (region where the driver waited, waiting seconds) for every pair of consecutive trips.
  val waits = trips.groupBy(_._1).values.toList.flatMap { driverTrips =>
    val ordered = driverTrips.sortBy(_._4)
    // Zipping with the tail yields no pair for a driver with a single trip, so such a
    // driver contributes no (meaningless) waiting time.
    ordered.zip(ordered.drop(1)).map { case (previous, next) =>
      (previous._3, next._4 - previous._5)
    }
  }

  waits.groupBy(_._1)
    // Divide as Double so the average is not truncated to whole seconds.
    .map { case (region, regionWaits) =>
      (region, regionWaits.map(_._2).sum.toDouble / regionWaits.length)
    }
    .foreach(println)
}
练习3
需求: 统计每个用户一小时内的最大登录次数
user_id,login_time
/**
 * Exercise 3: prints, for every user, the largest number of logins that fall inside any
 * one-hour window.
 *
 * Each record is `user_id,login_time`. A window starts at one of the user's own logins
 * (inclusive) and ends one hour later (exclusive). One `(user, maxLogins)` tuple is printed
 * per user.
 *
 * @param args unused
 */
def main(args: Array[String]): Unit = {
  val datas = List(
    "a,2020-07-11 10:51:12",
    "a,2020-07-11 11:05:00",
    "a,2020-07-11 11:15:20",
    "a,2020-07-11 11:25:05",
    "a,2020-07-11 11:45:00",
    "a,2020-07-11 11:55:36",
    "a,2020-07-11 11:59:56",
    "a,2020-07-11 12:35:12",
    "a,2020-07-11 12:58:59",
    "b,2020-07-11 14:51:12",
    "b,2020-07-11 14:05:00",
    "b,2020-07-11 15:15:20",
    "b,2020-07-11 15:25:05",
    "b,2020-07-11 16:45:00",
    "b,2020-07-11 16:55:36",
    "b,2020-07-11 16:59:56",
    "b,2020-07-11 17:35:12",
    "b,2020-07-11 17:58:59")

  val formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
  val hourMillis = 1000 * 60 * 60

  // (user, login time) pairs.
  val logins = datas.map { row =>
    val parts = row.split(",")
    (parts(0), formatter.parse(parts(1)))
  }

  // Order by login time first, then collect each user's logins.
  val byUser = logins.sortBy(_._2).groupBy(_._1)

  val maxPerUser = byUser.map { case (user, records) =>
    val stamps = records.map(_._2.getTime)
    // For every login, count this user's logins in [login, login + 1 hour).
    val counts = stamps.map { windowStart =>
      stamps.count { stamp =>
        val elapsed = stamp - windowStart
        elapsed >= 0 && elapsed < hourMillis
      }
    }
    (user, counts.max)
  }

  maxPerUser.foreach(println)
}
练习4
统计每个用户每个会话(超过半个小时没有登录,归为下一个会话)内的行为轨迹
/**
 * Exercise 4: reconstructs, for every user, the page trail of each session.
 *
 * A session ends when the user is inactive for more than half an hour; the next action
 * then opens a new session. Input records are (user id, action time, page).
 */
object Test {
  import java.text.SimpleDateFormat
  import java.util.UUID

  /** Largest gap, in milliseconds, between two consecutive actions of one session. */
  private val SessionGapMillis: Long = 30L * 60 * 1000

  /**
   * Prints the sample actions labelled with session ids: records that share an `id`
   * belong to the same session of the same user.
   */
  def main(args: Array[String]): Unit = {
    val list = List[(String, String, String)](
      ("1001", "2020-09-10 10:21:21", "home.html"),
      ("1001", "2020-09-10 10:28:10", "good_list.html"),
      ("1001", "2020-09-10 10:35:05", "good_detail.html"),
      ("1001", "2020-09-10 10:42:55", "cart.html"),
      ("1001", "2020-09-10 11:35:21", "home.html"),
      ("1001", "2020-09-10 11:36:10", "cart.html"),
      ("1001", "2020-09-10 11:38:12", "trade.html"),
      ("1001", "2020-09-10 11:40:00", "payment.html"),
      ("1002", "2020-09-10 09:40:00", "home.html"),
      ("1002", "2020-09-10 09:41:00", "mine.html"),
      ("1002", "2020-09-10 09:42:00", "favor.html"),
      ("1003", "2020-09-10 13:10:00", "home.html"),
      ("1003", "2020-09-10 13:15:00", "search.html"))
    sessionize(list).foreach(println)
  }

  /**
   * Labels every action with the id of the session it belongs to.
   *
   * @param records (user id, action time as `yyyy-MM-dd HH:mm:ss`, page) triples, in any order
   * @return the actions grouped per user and ordered by time within each user; actions of
   *         one session share the same `id`
   */
  def sessionize(records: List[(String, String, String)]): List[UserInfo] = {
    val format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    records
      .map { case (userId, time, page) =>
        // Every record starts with its own unique id; ids are merged per session below.
        UserInfo(UUID.randomUUID().toString, userId, format.parse(time).getTime, page)
      }
      .groupBy(_.userId)
      .values
      .toList
      .flatMap(assignSessionIds)
  }

  /**
   * Orders one user's actions by time and lets each action inherit the id of the previous
   * one when the gap between them is at most [[SessionGapMillis]].
   */
  private def assignSessionIds(actions: List[UserInfo]): List[UserInfo] =
    actions.sortBy(_.time) match {
      case Nil => Nil
      case first :: rest =>
        // `previous` is the already-relabelled predecessor, so a session id propagates
        // along the whole chain of close-enough actions.
        rest.scanLeft(first) { (previous, current) =>
          if (current.time - previous.time <= SessionGapMillis) current.copy(id = previous.id)
          else current
        }
    }

  /**
   * One user action.
   *
   * @param id     session identifier; kept as a `var` for backward compatibility, although
   *               the code in this object never reassigns it
   * @param userId user who performed the action
   * @param time   action time in epoch milliseconds
   * @param page   page that was visited
   */
  case class UserInfo(var id: String, val userId: String, val time: Long, val page: String)
}