电商商品数据生成及统计
生成电商商品数据集
1.产品名称Username
例如:联想lenovo,惠普hp,戴尔dell,苹果apple,华硕asus,华为huawei,小米mi 等
2浏览量pv
记录页面被访问的次数,用户每打开或刷新一次页面都会被计算一次。
3访客量uv
用户点击主页或产品时被记录,一天之内客户端只被计算一次。
4登录时间time
记录用户上线的时间。
5交易金额
记录用户一天之内交易的金额总数。
6退货码
购买商品后,用户根据不同的退货原因选择对应的退货码进行退货。
/**
 * Plain data holder for one e-commerce product record.
 *
 * <p>Every value is stored exactly as supplied; no validation or conversion is performed.
 */
public class Shop {
    /** Product name. The capitalised field name is kept exactly as originally declared. */
    private String Username;
    /** Page views (PV), kept as text. */
    private String pv;
    /** Unique visitors (UV), kept as text. */
    private String uv;
    /** Login time, kept as text. */
    private String time;
    /** Transaction amount. */
    private Integer cash;

    /**
     * Creates a fully populated record.
     *
     * @param username product name
     * @param pv       page views
     * @param uv       unique visitors
     * @param time     login time
     * @param cash     transaction amount
     */
    public Shop(String username, String pv, String uv, String time, Integer cash) {
        this.Username = username;
        this.pv = pv;
        this.uv = uv;
        this.time = time;
        this.cash = cash;
    }

    // ---- read accessors ----

    /** @return the product name */
    public String getUsername() {
        return this.Username;
    }

    /** @return the page views */
    public String getPv() {
        return this.pv;
    }

    /** @return the unique visitors */
    public String getUv() {
        return this.uv;
    }

    /** @return the login time */
    public String getTime() {
        return this.time;
    }

    /** @return the transaction amount */
    public Integer getCash() {
        return this.cash;
    }

    // ---- write accessors ----

    /** @param username the new product name */
    public void setUsername(String username) {
        this.Username = username;
    }

    /** @param pv the new page-view value */
    public void setPv(String pv) {
        this.pv = pv;
    }

    /** @param uv the new unique-visitor value */
    public void setUv(String uv) {
        this.uv = uv;
    }

    /** @param time the new login time */
    public void setTime(String time) {
        this.time = time;
    }

    /** @param cash the new transaction amount */
    public void setCash(Integer cash) {
        this.cash = cash;
    }
}
创建一个普通实体类来生成商品数据
Serializable 是对象序列化的标记接口:一个类只有实现了 Serializable 接口,它的对象才可以被序列化。因此,如果要序列化某些类的对象,这些类就必须实现 Serializable 接口。实际上,Serializable 是一个空接口,不包含任何方法,它的作用只是标识该类的对象可以被序列化。
/**
 * Serializable bean describing one product record, including its return codes.
 *
 * <p>All fields start out as {@code null} and are populated through the setters.
 * Field names are deliberately left unchanged (including the capitalised {@code Username}),
 * because reflection-based tools derive their keys from the field names.
 */
public class ShopCanData implements Serializable {
    /** Product name. */
    private String Username;
    /** Page views (PV). */
    private String pv;
    /** Unique visitors (UV). */
    private String uv;
    /** Login time. */
    private String time;
    /** Transaction amount. */
    private Integer cash;
    /** Return codes. */
    private List<Integer> returnList;

    /**
     * Creates an empty record.
     *
     * <p>The previous body assigned every field to itself, which had no effect; the fields
     * simply keep their default {@code null} values.
     */
    public ShopCanData() {
    }

    /** @return the product name, or {@code null} if unset */
    public String getUsername() {
        return Username;
    }

    /** @param username the product name */
    public void setUsername(String username) {
        Username = username;
    }

    /** @return the page views, or {@code null} if unset */
    public String getPv() {
        return pv;
    }

    /** @param pv the page views */
    public void setPv(String pv) {
        this.pv = pv;
    }

    /** @return the unique visitors, or {@code null} if unset */
    public String getUv() {
        return uv;
    }

    /** @param uv the unique visitors */
    public void setUv(String uv) {
        this.uv = uv;
    }

    /** @return the login time, or {@code null} if unset */
    public String getTime() {
        return time;
    }

    /** @param time the login time */
    public void setTime(String time) {
        this.time = time;
    }

    /** @return the transaction amount, or {@code null} if unset */
    public Integer getCash() {
        return cash;
    }

    /** @param cash the transaction amount */
    public void setCash(Integer cash) {
        this.cash = cash;
    }

    /** @return the return codes, or {@code null} if unset */
    public List<Integer> getReturnList() {
        return returnList;
    }

    /** @param returnList the return codes */
    public void setReturnList(List<Integer> returnList) {
        this.returnList = returnList;
    }
}
生成数据的效果
创建方法,随机生成商品数据
将浏览量pv和访客量uv随机生成数据
/**
 * Generates random digit strings used as sample page-view (PV) and unique-visitor (UV) values.
 */
public class ShopTest {
    /** Largest {@code num} that is handled without prepending a further random chunk. */
    private static final int DIRECT_LIMIT = 6;

    /** One shared generator instead of a new instance on every call. */
    private static final Random RANDOM = new Random();

    /** Prints one sample PV value followed by one sample UV value. */
    public static void main(String[] args) {
        System.out.println(pv(6));
        System.out.print(uv(6));
    }

    /**
     * Builds a random string of base-4 digits.
     *
     * <p>While {@code num} is greater than 6, a full random chunk is emitted and {@code num}
     * is reduced by 2. The final chunk has its first {@code num} characters dropped.
     *
     * @param num controls how many chunks are emitted and how much of the last one is dropped
     * @return a string consisting only of the digits 0-3 (possibly empty)
     * @throws StringIndexOutOfBoundsException if {@code num} is negative
     */
    public static String pv(int num) {
        return randomDigits(num, 4, 2);
    }

    /**
     * Builds a random string of base-5 digits.
     *
     * <p>While {@code num} is greater than 6, a full random chunk is emitted and {@code num}
     * is reduced by 3. The final chunk has its first {@code num} characters dropped.
     *
     * @param num controls how many chunks are emitted and how much of the last one is dropped
     * @return a string consisting only of the digits 0-4 (possibly empty)
     * @throws StringIndexOutOfBoundsException if {@code num} is negative
     */
    public static String uv(int num) {
        return randomDigits(num, 5, 3);
    }

    /** Shared implementation of {@link #pv(int)} and {@link #uv(int)}. */
    private static String randomDigits(int num, int radix, int step) {
        StringBuilder digits = new StringBuilder();
        int remaining = num;
        while (remaining > DIRECT_LIMIT) {
            digits.append(nextChunk(radix));
            remaining -= step;
        }
        String tail = nextChunk(radix);
        // Cap the offset at the chunk length: a short random chunk used to make substring throw.
        digits.append(tail.substring(Math.min(remaining, tail.length())));
        return digits.toString();
    }

    /** Returns one non-negative random int rendered in the given radix. */
    private static String nextChunk(int radix) {
        // Clearing the sign bit always yields a non-negative value; negating fails for
        // Integer.MIN_VALUE and let a leading '-' slip into the output.
        return Integer.toString(RANDOM.nextInt() & Integer.MAX_VALUE, radix);
    }
}
创建类,将数据写入hdfs
/**
 * Writes ShopCanData records as JSON lines into numbered files under one directory
 * of a Hadoop FileSystem, moving on to the next file once MAX_LINE_NUMS lines are reached.
 */
public class WriteHdfsHandler {
// Directory that the data files are written into.
private String ShopCanDataDir;
// Base file name; the numeric suffix below is appended to it to form each file name.
private final String FILE_NAME = "shop_data.json.";
// Suffix of the file currently being written; starts at 1.
private int curFileNameSuffix = 1;
/**
 * Number of lines written to the current file so far.
 */
private int curLineNum = 0;
/**
 * Maximum number of lines per file; once reached, writing moves on to the next file.
 */
private final int MAX_LINE_NUMS = 1000000;
// File system handle used for every read and write.
private FileSystem fs;
// NOTE(review): this constructor header is an excerpt that stops after its first statement;
// the complete constructor appears further down in this document.
public WriteHdfsHandler(String ShopCanDataDir, String defaultFS) throws IOException {
this.ShopCanDataDir = ShopCanDataDir;
循环生成多个电商数据,写入hdfs
先获取HDFS文件系统对象;如果父目录已存在则先删除,再重新创建;然后循环创建新文件并将数据写入HDFS。特殊处理:避免追加(append)操作,因为在伪分布式环境下append会报错。
/**
 * Connects to the given file system and prepares an empty output directory.
 *
 * <p>If the directory already exists it is deleted recursively and then re-created,
 * so any earlier contents are lost.
 *
 * @param ShopCanDataDir directory that will receive the data files
 * @param defaultFS      value stored under the {@code fs.defaultFS} configuration key
 * @throws IOException if the file system cannot be obtained or the directory cannot be prepared
 */
public WriteHdfsHandler(String ShopCanDataDir, String defaultFS) throws IOException {
    this.ShopCanDataDir = ShopCanDataDir;

    Configuration hadoopConf = new Configuration();
    hadoopConf.set("fs.defaultFS", defaultFS);
    this.fs = FileSystem.get(hadoopConf);

    // One Path instance serves the existence check, the delete and the mkdirs call.
    Path outputDir = new Path(this.ShopCanDataDir);
    if (this.fs.exists(outputDir)) {
        this.fs.delete(outputDir, true);
    }
    this.fs.mkdirs(outputDir);
}
/**
 * Builds the path of the file currently being written.
 *
 * @return the output directory, a {@code "/"}, the base file name and the current numeric suffix
 */
public String getCurCanDataFilePath() {
    String currentFileName = FILE_NAME + this.curFileNameSuffix;
    return this.ShopCanDataDir + "/" + currentFileName;
}
/**
 * Writes the given records as JSON, one record per line, into the current data file,
 * moving on to the next numbered file whenever {@code MAX_LINE_NUMS} lines have been written.
 *
 * <p>After the batch is written, a partially filled file is abandoned (the suffix is
 * advanced), so the next call starts a fresh file instead of appending to this one.
 *
 * @param shopDataList records to write; {@code null} or an empty list is a no-op
 * @throws IOException if a file cannot be opened, written or closed
 */
public void writeCanDataToHdfs(List<ShopCanData> shopDataList) throws IOException {
    if (shopDataList == null || shopDataList.isEmpty()) {
        return;
    }
    // One serializer for the whole batch instead of one per output file.
    Gson gson = new Gson();
    int curHandedIndex = 0;
    while (curHandedIndex < shopDataList.size()) {
        if (curLineNum >= MAX_LINE_NUMS) {
            // Current file is full: move on to the next one.
            this.curFileNameSuffix += 1;
            curLineNum = 0;
        }
        Path target = new Path(getCurCanDataFilePath());
        // try-with-resources closes the stream even when a write fails part-way through.
        // The charset is spelled out so the JSON bytes do not depend on the platform default.
        try (FSDataOutputStream out = fs.exists(target) ? fs.append(target) : fs.create(target);
             BufferedWriter writer = new BufferedWriter(
                     new OutputStreamWriter(out, java.nio.charset.StandardCharsets.UTF_8))) {
            for (; curHandedIndex < shopDataList.size() && curLineNum < MAX_LINE_NUMS; curHandedIndex++) {
                ShopCanData record = shopDataList.get(curHandedIndex);
                writer.write(gson.toJson(record) + "\r\n");
                curLineNum++;
            }
        }
    }
    // Leave a partially filled file behind so the next call creates a new file
    // rather than taking the append branch above.
    if (this.curLineNum > 0 && this.curLineNum < MAX_LINE_NUMS) {
        this.curFileNameSuffix++;
        curLineNum = 0;
    }
}
/**
 * Closes the underlying file system handle.
 *
 * <p>NOTE(review): the handle comes from {@code FileSystem.get}; if that returns a shared
 * instance, closing it here may affect other users of the same instance - confirm.
 *
 * @throws IOException if closing the file system fails
 */
public void close() throws IOException {
    this.fs.close();
}
}
统计各个商品退货数最多的前十个商品
读当天的数据文件,得到一个RDD
// Entry point (walkthrough excerpt): sets up a Spark context with master "local"
// and reads every file under the 2023-11-05 data directory as lines of text.
public static void main(String[] args) {
SparkConf sparkConf = new SparkConf().setAppName("123").setMaster("local").set("spark.driver.host","localhost");
JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
JavaRDD<String> text = sparkContext.textFile("hdfs://192.168.53.3:9000/shop_data/2023-11-05/*");
将元素为String类型的RDD转成ShopCanData类型的RDD
// Walkthrough excerpt: start of the pipeline whose collected result is stored in `re`.
// Each input line is parsed from JSON into a ShopCanData.
List<Tuple2<String, List<Tuple2<String,Integer>>>> re = text.map(new Function<String, ShopCanData>() {
@Override
public ShopCanData call(String s) throws Exception {
// A new Gson instance is created for every line.
Gson gson = new Gson();
return gson.fromJson(s,ShopCanData.class);
}
})
将ShopCanData类型的RDD转成键值对RDD
// Walkthrough excerpt: key every record by the value of getUsername().
.mapToPair(new PairFunction<ShopCanData, String, ShopCanData>() {
@Override
public Tuple2<String, ShopCanData> call(ShopCanData ShopCanData) throws Exception {
return new Tuple2(ShopCanData.getUsername(),ShopCanData);
}
})
// Keep only records whose time value, up to the first space, equals "2023-11-05".
.filter(t -> t._2.getTime().split(" ")[0].equals("2023-11-05"))
分组:将相同产品名称的数据合并在一起
// NOTE(review): the mapToPair and filter calls below repeat the preceding walkthrough step
// verbatim; the complete listing later in this document applies them only once.
.mapToPair(new PairFunction<ShopCanData, String, ShopCanData>() {
@Override
public Tuple2<String, ShopCanData> call(ShopCanData ShopCanData) throws Exception {
return new Tuple2(ShopCanData.getUsername(),ShopCanData);
}
})
.filter(t -> t._2.getTime().split(" ")[0].equals("2023-11-05"))
// Gather all records that share the same key (the getUsername() value).
.groupByKey()
使用map算子转换分组后的键值对RDD,将每一个键值对转换一个Tuple2对象
// Walkthrough excerpt: turn each (key, records) group into (key, list of (sub-key, count)).
.map(new Function<Tuple2<String, Iterable<ShopCanData>>, Tuple2<String, List<Tuple2<String,Integer>>>>() {
@Override
public Tuple2<String, List<Tuple2<String,Integer>>> call(Tuple2<String, Iterable<ShopCanData>> t) throws Exception {
HashMap<String,Integer> canMap=new HashMap<>();
List<Tuple2<String,Integer>> canList = new ArrayList<>();
int topNum = 10;
// Sum the return-list sizes per "<getUsername()>_<getPv()>" key.
for (ShopCanData one:t._2){
String typeAndVin = one.getUsername()+"_"+one.getPv();
if (canMap.containsKey(typeAndVin)){
canMap.put(typeAndVin,canMap.get(typeAndVin)+one.getReturnList().size());
}else {
canMap.put(typeAndVin,one.getReturnList().size());
}
}
获取前10个商品并将结果输出到控制台
// Walkthrough excerpt: copy the per-key totals from the map into a list of (key, count) pairs.
Set<String> keys = canMap.keySet();
for (String key:keys){
canList.add(new Tuple2<>(key,canMap.get(key)));
}
// Sort by count, largest first; equal counts compare as equal.
canList.sort(new Comparator<Tuple2<String, Integer>>() {
@Override
public int compare(Tuple2<String, Integer> o1, Tuple2<String, Integer> o2) {
int o1Num = o1._2.intValue();
int o2Num = o2._2.intValue();
return o1Num >= o2Num ? (o1Num > o2Num ? -1 :0) : 1;
}
});
// Keep only the first topNum entries.
if (canList.size()>topNum)
canList = new ArrayList<>(canList.subList(0,topNum));
return new Tuple2(t._1,canList);
}
})
// Bring every group's result back to the driver as a list.
.collect();
// .sortBy(t->t._2,false,2)
// .filter(t->"E100".equals(t._1.split("_")[0]))
// .take(3)
// .forEach(System.out::println);
用filter算子将商品名称的前十个商品总数量输出到控制台
package qimotest;
import qimotest.ShopCanData;
import com.google.gson.Gson;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
import java.util.*;
/**
 * Spark job that reads one day's product records from HDFS and prints, for every product
 * name, the ten "product_pv" combinations with the highest total number of return codes.
 *
 * <p>Usage: {@code ReturnCodeCount [date]}. {@code date} defaults to {@code 2023-11-05} and
 * selects both the input directory and the records that are kept.
 */
public class ReturnCodeCount {
    /** Date processed when no argument is supplied. */
    private static final String DEFAULT_DATE = "2023-11-05";

    /** Parent of the per-day input directories. */
    private static final String INPUT_ROOT = "hdfs://192.168.53.3:9000/shop_data/";

    /** Number of entries kept per product. */
    private static final int TOP_N = 10;

    /** Shared JSON parser; static so it is not captured and serialized with the Spark functions. */
    private static final Gson GSON = new Gson();

    /**
     * Runs the job and prints the result to standard output.
     *
     * @param args optional; {@code args[0]} is the date to process in {@code yyyy-MM-dd} form
     */
    public static void main(String[] args) {
        String date = args.length > 0 ? args[0] : DEFAULT_DATE;
        SparkConf sparkConf = new SparkConf().setAppName("123").setMaster("local").set("spark.driver.host", "localhost");

        // try-with-resources stops the context even when the job fails.
        try (JavaSparkContext sparkContext = new JavaSparkContext(sparkConf)) {
            JavaRDD<String> text = sparkContext.textFile(INPUT_ROOT + date + "/*");

            List<Tuple2<String, List<Tuple2<String, Integer>>>> re = text
                    // Parse each line; Gson returns null for an empty line.
                    .map(line -> GSON.fromJson(line, ShopCanData.class))
                    // Drop unparsable or undated records instead of failing with a NullPointerException,
                    // and keep only records whose time starts with the requested date.
                    .filter(record -> record != null
                            && record.getTime() != null
                            && date.equals(record.getTime().split(" ")[0]))
                    // Group the records by product name.
                    .mapToPair(record -> new Tuple2<>(record.getUsername(), record))
                    .groupByKey()
                    .map(ReturnCodeCount::topReturnCounts)
                    .collect();

            for (Tuple2<String, List<Tuple2<String, Integer>>> one : re) {
                System.out.printf("%s商品发生退货的前10个商品:\n", one._1());
                for (Tuple2<String, Integer> two : one._2()) {
                    System.out.println(two);
                }
            }
        }
    }

    /**
     * Sums the number of return codes per "product_pv" key within one product group and
     * keeps the {@link #TOP_N} keys with the highest totals, largest first.
     *
     * @param group product name paired with all of that product's records
     * @return the product name paired with its ranked (key, total) list
     */
    private static Tuple2<String, List<Tuple2<String, Integer>>> topReturnCounts(
            Tuple2<String, Iterable<ShopCanData>> group) {
        Map<String, Integer> totals = new HashMap<>();
        for (ShopCanData record : group._2()) {
            List<Integer> returns = record.getReturnList();
            // A record without a return list contributes zero returns.
            int count = returns == null ? 0 : returns.size();
            totals.merge(record.getUsername() + "_" + record.getPv(), count, Integer::sum);
        }

        List<Tuple2<String, Integer>> ranked = new ArrayList<>(totals.size());
        for (Map.Entry<String, Integer> entry : totals.entrySet()) {
            ranked.add(new Tuple2<>(entry.getKey(), entry.getValue()));
        }
        // Descending by total.
        ranked.sort((a, b) -> Integer.compare(b._2(), a._2()));

        if (ranked.size() > TOP_N) {
            // Copy the sub-list: a subList view is tied to its backing list.
            ranked = new ArrayList<>(ranked.subList(0, TOP_N));
        }
        return new Tuple2<>(group._1(), ranked);
    }
}