在此练习之前,要了解二次排序的使用。
java代码示例
package secondary;
import scala.math.Ordered;
import java.io.Serializable;
/**
 * Created by Administrator on 2018/1/18.
 *
 * Composite key used for secondary sorting: keys are ordered by {@code first}
 * and, when those are equal, by {@code second}.
 *
 * Every comparison operator is derived from {@link #compare(SecondarySortKey)},
 * so the operators can never disagree with each other.
 */
public class SecondarySortKey implements Ordered<SecondarySortKey>, Serializable {
    private int first;
    private int second;

    /**
     * @param first  primary sort component
     * @param second secondary sort component, used only to break ties on {@code first}
     */
    public SecondarySortKey(int first, int second) {
        this.first = first;
        this.second = second;
    }

    public int getFirst() {
        return first;
    }

    public void setFirst(int first) {
        this.first = first;
    }

    public int getSecond() {
        return second;
    }

    public void setSecond(int second) {
        this.second = second;
    }

    /**
     * Compares this key with {@code that}.
     *
     * @return a negative value, zero, or a positive value when this key is
     *         smaller than, equal to, or greater than {@code that}
     */
    @Override
    public int compare(SecondarySortKey that) {
        // Integer.compare instead of subtraction: a difference of two ints can
        // overflow and flip its sign when the operands are far apart.
        int byFirst = Integer.compare(this.first, that.first);
        return byFirst != 0 ? byFirst : Integer.compare(this.second, that.second);
    }

    /** @return true when this key is strictly smaller than {@code that} */
    @Override
    public boolean $less(SecondarySortKey that) {
        return compare(that) < 0;
    }

    /** @return true when this key is strictly greater than {@code that} (false for equal keys) */
    @Override
    public boolean $greater(SecondarySortKey that) {
        return compare(that) > 0;
    }

    /** @return true when this key is smaller than or equal to {@code that} */
    @Override
    public boolean $less$eq(SecondarySortKey that) {
        return compare(that) <= 0;
    }

    /** @return true when this key is greater than or equal to {@code that} */
    @Override
    public boolean $greater$eq(SecondarySortKey that) {
        return compare(that) >= 0;
    }

    /** Same ordering as {@link #compare(SecondarySortKey)}. */
    @Override
    public int compareTo(SecondarySortKey that) {
        return compare(that);
    }
}
Test:
package secondary;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
/**
 * Created by Administrator on 2018/1/18.
 *
 * Demo driver for secondary sorting with the Java Spark API: prints the lines
 * of the input file ordered, descending, by their first two integer columns.
 */
public class SortSecondTest {
    /**
     * Reads the hard-coded input file, keys every line by a
     * {@link SecondarySortKey} built from its first two comma-separated
     * columns, sorts by that key in descending order and prints the lines.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("Test");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            // A line such as "3,4" becomes (SecondarySortKey(3, 4), "3,4") so that
            // sortByKey can order the lines by the custom key.
            sc.textFile("D:\\sort.txt")
                    .mapToPair(new PairFunction<String, SecondarySortKey, String>() {
                        @Override
                        public Tuple2<SecondarySortKey, String> call(String line) throws Exception {
                            final String[] fields = line.split(",");
                            // Trim so that "3, 4" parses as well as "3,4".
                            final SecondarySortKey secondarySortKey = new SecondarySortKey(
                                    Integer.parseInt(fields[0].trim()),
                                    Integer.parseInt(fields[1].trim()));
                            return new Tuple2<>(secondarySortKey, line);
                        }
                    })
                    .sortByKey(false)
                    .foreach(new VoidFunction<Tuple2<SecondarySortKey, String>>() {
                        @Override
                        public void call(Tuple2<SecondarySortKey, String> tuple) throws Exception {
                            System.out.println(tuple._2());
                        }
                    });
        } finally {
            // Release the Spark context even when the job fails.
            sc.stop();
        }
    }
}
接下来是scala代码示例:
package core.secondary
import org.apache.spark.{SparkConf, SparkContext}
/**
  * Created by Administrator on 2018/1/18.
  *
  * Composite key for secondary sorting: keys are ordered by `first` and, when
  * those are equal, by `second`.
  *
  * @param first  primary sort component
  * @param second secondary sort component, used only to break ties on `first`
  */
class SecondarySortKey(var first: Int, var second: Int) extends Ordered[SecondarySortKey] with Serializable {

  /**
    * Compares this key with `that`.
    *
    * @return a negative value, zero, or a positive value when this key is
    *         smaller than, equal to, or greater than `that`
    */
  override def compare(that: SecondarySortKey): Int = {
    // Integer.compare instead of subtraction: the difference of two Ints can
    // overflow and flip its sign when the operands are far apart.
    val byFirst = Integer.compare(this.first, that.first)
    if (byFirst != 0) byFirst else Integer.compare(this.second, that.second)
  }
}
/**
  * Demo driver for secondary sorting with the Scala Spark API: prints the
  * lines of the input file ordered, ascending, by their first two integer
  * columns.
  */
object Test {

  /**
    * Reads the hard-coded input file, keys every line by a `SecondarySortKey`
    * built from its first two comma-separated columns, sorts by that key and
    * prints the lines.
    *
    * @param args unused
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("test").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      sc.textFile("D:\\sort.txt")
        .map { line =>
          val fields = line.split(",")
          // Trim so that "3, 4" parses as well as "3,4".
          (new SecondarySortKey(fields(0).trim.toInt, fields(1).trim.toInt), line)
        }
        .sortByKey()
        .foreach { case (_, line) => println(line) }
    } finally {
      // Release the Spark context even when the job fails.
      sc.stop()
    }
  }
}
是不是很简洁呢?
好啦,现在我们就可以做这个综合练习啦。
需求及需求分析:
题:手机端美团APP(饿了么,百度外卖)
指标:
1)获取 【点击】,【下单】,【支付】次数排名前十的品类(二次排序)
数据是存储在HDFS之上的(已经有人经过了ETL的操作了)
数据是按天进行存储(一个目录就是一天)
date:
日期 2018 01 18
user_id:
用户id
session_id
会话ID
page_id
页面
action_time
访问的时间
city_id
访问用户所在的城市
search_keywords
用户搜索的时候写的关键词
【click_category_id】 这个地方只会有一个品类 1
用户点击的品类的ID
click_product_id
用户点击的产品ID
【order_category_id】 如果是下单有可能是多个品类 ,如果是多个品类是 1^A2
用户下单的品类id
order_product_id
用户下单的产品id
【pay_category_id】
用户支付的品类id 如果是支付有可能是多个品类 ,如果是多个品类是 1^A2^A3
pay_product_id
用户支付的产品ID
===============================================================================================
思路分析:
获取 【点击】,【下单】,【支付】次数排名前十的品类
关键字:【点击】,【下单】,【支付】 次数 【品类】
重点解释:点击、下单、支付是没有任何联系的,不是我们想象中的只有点击了,才有下单的可能,
只有下单了才有支付的可能,现实业务处理就是这样。但是我们的需求是按照【点击】,
【下单】,【支付】次数排名:如果点击次数相同,按下单次数排序;下单次数还相同的话,按支付次数排序。
教室里面:
1 2 3 4
2 2 2 2
3 3 2 1
需要我们把数据处理成如下的格式:如何转换成这样的格式?
1 2 3 4
2 2 2 2
3 3 2 1
1)获取到数据里面所涉及到的所有的品类的ID RDD[(ID,ID)] rdd1
如何获取到所有的品类ID?
a)获取到所有点击的品类 id rdda
b) 获取到所有的下单的品类id rddb
c)获取到所有的支付的品类 id rddc
(rdda union rddb union rddc).distinct 思路一定要这样!!!
ID,ID
2)分别计算出来每个品类的:
点击的次数 1 2 单词计数 rdd2
下单的次数 1 3 单词计数 rdd3
支付的次数 1 4 单词计数 rdd4
3)
rdd1.leftOuterJoin(rdd2).leftOuterJoin(rdd3).leftOuterJoin(rdd4)
1 ) 如果能join上,是多少次就是多少次;如果join不上,就是出现零次。
直接上代码,自己领悟:
排序规则代码:
package core.demo3
/**
  * Created by Administrator on 2018/1/19.
  *
  * Sort key for ranking categories: ordered by click count, then by order
  * count, then by pay count.
  *
  * @param clickCount number of clicks
  * @param orderCount number of orders, used to break ties on `clickCount`
  * @param payCount   number of payments, used to break ties on `orderCount`
  */
class SortKey(var clickCount: Long, var orderCount: Long, var payCount: Long) extends Ordered[SortKey] with Serializable {

  /**
    * Compares this key with `that`.
    *
    * @return -1, 0 or 1 when this key is smaller than, equal to, or greater
    *         than `that`; 0 is returned when all three counts are equal
    */
  override def compare(that: SortKey): Int = {
    // java.lang.Long.compare avoids the overflow a subtraction could cause, and
    // math.signum pins the result to -1, 0 or 1.
    val byClick = java.lang.Long.compare(this.clickCount, that.clickCount)
    val byOrder = java.lang.Long.compare(this.orderCount, that.orderCount)
    val byPay = java.lang.Long.compare(this.payCount, that.payCount)
    val firstDifference =
      if (byClick != 0) byClick
      else if (byOrder != 0) byOrder
      else byPay
    math.signum(firstDifference)
  }
}
业务代码:
package core.demo3
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable
/**
  * Created by Administrator on 2018/1/18.
  *
  * Ranks product categories by click count, then order count, then pay count,
  * and prints the ten highest-ranked categories.
  *
  * Every input line is a comma-separated record. Column 7 holds the clicked
  * category id, column 9 the ordered category ids and column 11 the paid
  * category ids (zero-based). Several ids in one column are separated by the
  * literal text "^A".
  */
object TopN {

  /** Zero-based column of the clicked category id. */
  private val ClickCategoryIndex = 7

  /** Zero-based column of the ordered category ids. */
  private val OrderCategoryIndex = 9

  /** Zero-based column of the paid category ids. */
  private val PayCategoryIndex = 11

  /** Regex matching the separator between several category ids in one column. */
  private val CategorySeparator = "\\^A"

  /** Input path used when none is given on the command line. */
  private val DefaultInputPath = "hdfs://hadoop1:9000/20180909"

  /**
    * Runs the whole pipeline.
    *
    * @param args optional first element: path of the input data; defaults to
    *             the path the program originally had hard-coded
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("topN").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      val inputPath = args.headOption.getOrElse(DefaultInputPath)
      val rdd: RDD[String] = sc.textFile(inputPath)

      // Step 1: collect the id of every category that occurs in the data.
      val allCategoryIDS: RDD[(Long, Long)] = getAllCategoryID(rdd)

      // Step 2: count clicks, orders and payments per category.
      val clickCategoryCount: RDD[(Long, Long)] = getClickCategoryCount(rdd)
      val orderCategoryCount: RDD[(Long, Long)] = getOrderCategoryCount(rdd)
      val payCategoryCount: RDD[(Long, Long)] = getPayCategoryCount(rdd)

      // Step 3: left-join the ids with the three counts so that every category
      // gets one row holding its click, order and pay counts.
      val resultRDD: RDD[(Long, String)] =
        joinCategoryAndData(allCategoryIDS, clickCategoryCount, orderCategoryCount, payCategoryCount)

      // Step 4: sort with the custom key and output the ten best categories.
      top10(resultRDD)
    } finally {
      // Release the Spark context even when the job fails.
      sc.stop()
    }
  }

  /**
    * Extracts one column of a record.
    *
    * @param line  comma-separated record
    * @param index zero-based column position
    * @return the trimmed column, or None when the column is missing or blank
    */
  private def column(line: String, index: Int): Option[String] = {
    // Limit -1 keeps trailing empty columns; without it a record whose last
    // columns are empty is shorter than expected and indexing would throw.
    val fields = line.split(",", -1)
    if (index < fields.length) Some(fields(index).trim).filter(_.nonEmpty) else None
  }

  /** Parses a column that holds at most one category id. */
  private def singleCategoryId(line: String, index: Int): List[Long] =
    column(line, index).map(_.toLong).toList

  /** Parses a column that may hold several category ids, skipping blank entries. */
  private def multiCategoryIds(line: String, index: Int): List[Long] =
    column(line, index).toList
      .flatMap(_.split(CategorySeparator))
      .map(_.trim)
      .filter(_.nonEmpty)
      .map(_.toLong)

  /**
    * Collects every category id that was clicked, ordered or paid.
    *
    * @param rdd input records
    * @return one (id, id) pair per distinct category id
    */
  def getAllCategoryID(rdd: RDD[String]): RDD[(Long, Long)] = {
    rdd.flatMap { line =>
      // Build the ids per line; a collection shared between lines would emit
      // earlier ids again for every later line.
      singleCategoryId(line, ClickCategoryIndex) ++
        multiCategoryIds(line, OrderCategoryIndex) ++
        multiCategoryIds(line, PayCategoryIndex)
    }.distinct()
      .map(id => (id, id))
  }

  /**
    * Counts how often each category was clicked.
    *
    * @param rdd input records
    * @return (category id, number of clicks)
    */
  def getClickCategoryCount(rdd: RDD[String]): RDD[(Long, Long)] = {
    rdd.flatMap(line => singleCategoryId(line, ClickCategoryIndex))
      .map(id => (id, 1L))
      .reduceByKey(_ + _)
  }

  /**
    * Counts how often each category was ordered.
    *
    * @param rdd input records
    * @return (category id, number of orders)
    */
  def getOrderCategoryCount(rdd: RDD[String]): RDD[(Long, Long)] = {
    rdd.flatMap(line => multiCategoryIds(line, OrderCategoryIndex))
      .map(id => (id, 1L))
      .reduceByKey(_ + _)
  }

  /**
    * Counts how often each category was paid for.
    *
    * @param rdd input records
    * @return (category id, number of payments)
    */
  def getPayCategoryCount(rdd: RDD[String]): RDD[(Long, Long)] = {
    rdd.flatMap(line => multiCategoryIds(line, PayCategoryIndex))
      .map(id => (id, 1L))
      .reduceByKey(_ + _)
  }

  /**
    * Left-joins the category ids with the three per-category counts.
    *
    * A category that is missing from one of the count RDDs gets the count 0.
    *
    * @param allCategoryIDS     (id, id) pairs of all categories
    * @param clickCategoryCount (category id, number of clicks)
    * @param orderCategoryCount (category id, number of orders)
    * @param payCategoryCount   (category id, number of payments)
    * @return (category id, text) where text is
    *         "key=id|click_category_count=c|order_category_count=o|pay_category_count=p"
    *         and key is the value of constants.FIELDS_CATEGORY_ID
    */
  def joinCategoryAndData(
      allCategoryIDS: RDD[(Long, Long)],
      clickCategoryCount: RDD[(Long, Long)],
      orderCategoryCount: RDD[(Long, Long)],
      payCategoryCount: RDD[(Long, Long)]): RDD[(Long, String)] = {
    allCategoryIDS
      .leftOuterJoin(clickCategoryCount)
      .map { case (categoryId, (_, clicks)) =>
        (categoryId,
          s"${constants.FIELDS_CATEGORY_ID}=$categoryId|click_category_count=${clicks.getOrElse(0L)}")
      }
      .leftOuterJoin(orderCategoryCount)
      .map { case (categoryId, (text, orders)) =>
        (categoryId, s"$text|order_category_count=${orders.getOrElse(0L)}")
      }
      .leftOuterJoin(payCategoryCount)
      .map { case (categoryId, (text, payments)) =>
        (categoryId, s"$text|pay_category_count=${payments.getOrElse(0L)}")
      }
  }

  /**
    * Sorts the categories by click, order and pay count in descending order
    * and prints the text of the ten highest-ranked ones.
    *
    * The original version computed the ten rows and discarded them; printing
    * them makes the result visible while keeping the Unit return type.
    *
    * @param resultRDD (category id, text) rows as produced by joinCategoryAndData
    */
  def top10(resultRDD: RDD[(Long, String)]): Unit = {
    resultRDD
      .map { case (_, value) =>
        // Parts 1 to 3 are the "name=count" entries; part 0 is the category id.
        val parts = value.split("\\|")
        val clickCount = parts(1).split("=")(1).toLong
        val orderCount = parts(2).split("=")(1).toLong
        val payCount = parts(3).split("=")(1).toLong
        (new SortKey(clickCount, orderCount, payCount), value)
      }
      .sortByKey(ascending = false)
      .take(10)
      .foreach { case (_, value) => println(value) }
  }
}