看书标记【R语言 商务数据分析实战9】

看书标记——关于R语言


【R语言 商务数据分析实战9】


chapter 9

餐饮企业综合分析

统计分析>>ARIMA预测销售额>>协同过滤算法对菜品进行智能推荐>>Apriori算法对菜品进行关联分析>>K-means算法进行客户价值分析>>决策树算法进行客户流失预测
这几种算法是不同从不同方面得到不同的结果的独立模块,根据算法的要求,对数据进行相关的数据预处理

9.2 任务实现

统计餐饮数据

# 【分组聚合:使用aggregate或者split-lapply-cbind模式】

# 设置工作目录
setwd()
# [使用aggregate函数进行分组统计]
# 导入数据
score <- read.csv("./data/score.csv", stringsAsFactors = FALSE)
# 提取score中的gender字段
gender <- list(score$gender)
# 对score1和score2列进行分组统计
aggregate(score[, c(2,3)], gender, mean)
# [使用split-lapply-cbind模式进行分组统计]
# 分组
sp <- split(score, score$gender, drop = TRUE)
# 求score1和score2的均值
score1 <- lapply(sp, FUN = function(x) mean(x$score1))
score2 <- lapply(sp, FUN = function(x) mean(x$score2))
# 合并分组计算结果
result <- cbind(score1, score2 )
result

# 【使用melt和dcast这两个函数实现透视表功能】
library(reshape2)
data(airquality)
airquality <- airquality[29:34, ]
names(airquality) <- tolower(names(airquality))
airquality

# 【用melt转换数据格式】
md <- melt(airquality, id = c("month", "day"))
md
# 还原melt后的数据
cd <- dcast(md, month + day ~ variable)
cd

# 设置工作目录
setwd()
# 【统计每日用餐人数与销售额】
info <- read.csv("./data/meal_order_info.csv")  # 导入数据

info$use_start_time <- as.Date(info$use_start_time)  # 转换时间格式
table(info$order_status)  # 查看订单状态
info <- info[which(info$order_status == 1), ]  # 提取订单状态为1的数据

# 【统计每日用餐人数与营业额】
sale <- aggregate(info[, c(3, 9)], list(info$use_start_time), sum)
colnames(sale) <- c("date", "number", "saleroom")  # 修改列名

# 【导出每日的用餐人数和销售额】
write.csv(sale, "./tmp/sale_day.csv", row.names = FALSE)

# 【每日用餐人数折线图】
sale$date <- as.POSIXct(sale$date)  # 将date字段转换时间格式
plot(sale$date, sale$number, col = "orange", type="b", 
     xlab = "日期", ylab = "用餐人数")
# 画出每日营业额的折线图
plot(sale$date, sale$saleroom, col = "blue", type="o", 
     xlab = "日期", ylab = "营业额")

# 【计算菜品的热销度度】
# 导入数据
detail <- read.csv("./data/meal_order_detail.csv", stringsAsFactors = FALSE)

# 菜品名称删除回车符和空格
head(detail$dishes_name)  # 查看前6个菜品名称
detail$dishes_name <- gsub("\\s|\\n+", "", detail$dishes_name)

# 求出每个菜品的销售量
sales_volume <- aggregate(detail$counts, list(detail$dishes_name), sum)
colnames(sales_volume) <- c("dishes_name", "counts")

# 求出每个菜品的热销度
sales_formula <- (sales_volume$count - min(sales_volume$count)) /
  (max(sales_volume$count) - min(sales_volume$count))
sales_volume$sales_hot <- round(sales_formula, 3)  # 保留3位小数

# 查看热销度最高和最低的菜品
sales_volume[which(sales_volume$sales_hot == max(sales_volume$sales_hot)), ]
sales_volume[which(sales_volume$sales_hot == min(sales_volume$sales_hot)), ]

# 画出热销度最高的前10个菜品的条形图
# 按热销度进行排序
sales_volume <- sales_volume[order(sales_volume$sales_hot, decreasing = TRUE), ]
barplot(sales_volume[1:10, 3], names.arg = sales_volume[1:10, 1], 
        xlab = "菜品名称", ylab = "热销度", col = "blue")
write.csv(sales_volume, "./tmp/sales_volume.csv", row.names = FALSE)  # 导出数据

# 【计算菜品毛利率】
# 导入数据
dish <- read.csv("./data/meal_dishes_detail.csv", stringsAsFactors = FALSE)
dish <- dish[, c(1, 3, 4, 8, 14)]  # 特征选取
# 菜品名称删除回车符
head(dish$dishes_name)
dish$dishes_name <- gsub("\\s|\\n+", "", dish$dishes_name)

# 根据毛利率计算公式求出毛利率,并保留两位小数
dish$rate <- round((dish$price - dish$cost) / (dish$price), 2)

# 找出毛利率最高和最低的菜品
dish[which(dish$rate == max(dish$rate)), ]
dish[which(dish$rate == min(dish$rate)), ]
write.csv(dish, "./tmp/profit.csv", row.names = FALSE)  # 导出数据

构建ARIMA模型

# 设置工作目录并读取数据
# 【检验平稳性和纯随机性】
setwd()
sale <- read.csv("./tmp/sale_day.csv")

saleroom <- ts(sale[1:28, 3])
plot(saleroom, xlab = "时间", ylab = "销售额")  # 绘制时序图
acf(saleroom, lag.max = 30)  # 绘制ACF图

# 差分
saleroom.diff <- diff(saleroom, differences = 2)  # 进行差分
acf(saleroom.diff, lag.max = 30)  # 绘制差分后序列的ACF图
# 单位根检验
library(tseries)
adf.test(saleroom)
Box.test(saleroom, type = "Ljung-Box")  # 纯随机性检验
# 【构建模型】
# BIC图
library(TSA)
# 原序列定阶
saleroom.BIC <- armasubsets(y = saleroom, nar = 5, nma = 5)
plot(saleroom.BIC)
# 差分后的序列定阶
saleroom.diff.BIC <- armasubsets(y = saleroom.diff, nar = 5, nma = 5)
plot(saleroom.diff.BIC)

# 根据BIC图定阶
library(forecast)
# 初始化
checkout <- data.frame(p = 0, d = 0, q = 0, P = 0, D = 0, 
                       Q = 0, "残差P值" = 0, "平均误差" = 0)
test_checkout <- data.frame(p = 0, d = 0, q = 0, P = 0, D = 0, 
                            Q = 0, "残差P值" = 0, "平均误差" = 0)
j <- 1

test_model <- function(p, q, P, Q){
  model <- Arima(saleroom, order = c(p, 0, q),
                 seasonal = list(order = c(P, 2, Q), period = 7))
  result <- Box.test(model$residuals, type = "Ljung-Box")
  # 预测
  sale.forecast <- forecast(model, h = 3, level = c(99.5))
  # 计算平均绝对百分误差
  error <- abs(as.numeric(sale.forecast[[4]]) - sale[29:31,3]) / sale[29:31,3]
  p.value <- round(result$p.value, 4)
  print(paste('p=', p, ';q=', q, ';P=', P,',Q=', Q, ';残差P值:',
              p.value, ';平均误差:', mean(error), collapse = ""))
  test_checkout[1,1] <- p
  test_checkout[1,2] <- 0
  test_checkout[1,3] <- q
  test_checkout[1,4] <- P
  test_checkout[1,5] <- 2
  test_checkout[1,6] <- Q
  test_checkout[1,7] <- round(result$p.value, 4)
  test_checkout[1,8] <- mean(error) 
  return(test_checkout)
}

for (p in c(0,3,4,5)) {
  if (p == 0 | p == 3) {
    for (q in 1:5) {
      for (P in c(0,1)) {
        for (Q in c(1,2,3,5)) {
          test_checkout <- test_model(p, q, P, Q)
          checkout[j, ] <- test_checkout[1, ]
          j <- j + 1
        }
      }
    }
  }
  if (p == 4) {
    for (q in 1:5) {
      if (q == 1) {
        for (Q in c(1,2,3,5)) {
          test_checkout <- test_model(p, q, 1, Q)
          checkout[j, ] <- test_checkout[1, ]
          j <- j + 1
        }
      }
      if (q != 1) {
        for (Q in c(1,2,3,5)) {
          test_checkout <- test_model(p, q, 0, Q)
          checkout[j, ] <- test_checkout[1, ]
          j <- j + 1
        }
      }
    }
  }
  if (p == 5) {
    for (q in 1:5) {
      for (Q in c(1,2,3,5)) {
        test_checkout <- test_model(p, q, 0, Q)
        checkout[j, ] <- test_checkout[1, ]
        j <- j + 1
      }
    }
  }
}
write.csv(checkout, "./tmp/checkout.csv", row.names = F)  # 导出每个模型的结果


# 取最优模型预测
model <- Arima(saleroom, order = c(0,0,1), 
               seasonal = list(order = c(0,2,2), period = 7))
summary(model)

Box.test(model$residuals, type = "Ljung-Box")  # 纯随机性检验

# 预测未来3天的销售额
sale.forecast <- forecast(model, h = 3, level = c(99.5))
plot(sale.forecast)

# 计算平均误差
error <- abs(as.numeric(sale.forecast[[4]]) - sale[29:31,3]) / sale[29:31,3]
mean(error)

使用协同过滤算法实现菜品的智能推荐

# 设置工作目录并读取数据
setwd()
info <- read.csv("./data/meal_order_info.csv", stringsAsFactors = FALSE)
detail <- read.csv("./data/meal_order_detail.csv", stringsAsFactors = FALSE)

# 数据预处理
# 菜品名称删除回车符和空格
head(detail$dishes_name)
detail$dishes_name <- gsub("\\s|\\n+", "", detail$dishes_name)
# 删除白饭的记录
detail <- detail[-which(detail$dishes_name == "白饭/小碗" | 
                          detail$dishes_name == "白饭/大碗"), ]

# 查看订单状态
table(info$order_status)
# 统计订单状态为0或2的订单占比
info.id <- info[which(info$order_status == 0 | info$order_status == 2), "info_id"]
proportion <- length(info.id) / nrow(info)
proportion
# 删除detail数据中无意义的订单
detail <- detail[-which(detail$order_id %in% info.id), ]
# 提取订单状态为1的数据
info <- info[which(info$order_status == 1), ]

# 特征选取
info <- info[, c(1:3, 8:10, 12, 19:21)]
detail <- detail[, c(1:3, 6, 8, 9, 11, 19)]

# 写出数据归约后的订单表和订单详情表
write.csv(info, "./tmp/info_clear.csv", row.names = FALSE)
write.csv(detail, "./tmp/detail_clear.csv", row.names = FALSE)

# 【构建两个推荐模型】
# 基于物品的协同过滤
require(recommenderlab)
# 将用户ID和菜品名称转换为0-1二元型数据,即模型的输入数据集
dishes <- as(detail[, c(8, 4)], "binaryRatingMatrix") 
write.csv(as(dishes, "matrix"), "./tmp/dishes_matrix.csv")  # 导出二元型矩阵

model.IBCF <- Recommender(dishes, method = "IBCF")  # 建模

# 导出相似度矩阵
dishes.model.sim <- as(model.IBCF@model$sim, "matrix")
write.csv(dishes.model.sim, "./tmp/dishes_model_sim.csv")


# 利用模型对原始数据集进行预测并获得推荐长度为30的结果
recommend.IBCF <- predict(model.IBCF, dishes, n = 30)  # 推荐列表
as(recommend.IBCF, "list")[1:5]  # 查看前五个用户的推荐

# 将结果保存至工作目录下的文件中,需要将结果转换为list型。
# 对list型结果采用sink与print命令将其保存
sink("./tmp/recommend_IBCF.txt")
print(as(recommend.IBCF, "list"))
sink()


# 基于用户的协同过滤
model.UBCF <- Recommender(dishes, method = "UBCF")  # 建模
# 利用模型对原始数据集进行预测并获得推荐长度为30的结果
recommend.UBCF <- predict(model.UBCF, dishes, n = 30)  # 推荐列表
as(recommend.UBCF, "list")[1:5]  # 查看前五个用户的推荐

# 将结果保存至工作目录下的文件中,需要将结果转换为list型。
# 对list型结果采用sink与print命令将其保存
sink("./tmp/recommend_UBCF.txt")
print(as(recommend.UBCF, "list"))
sink()

# 【离线评价两个推荐模型】
# 评价模型
algorithms <- list("ItemCF" = list(name = "IBCF", param = NULL),
                   "UserCF" = list(name = "UBCF", param = NULL))
# 将数据以交叉检验划分成10份,9份训练,1份测试
dishes.es <- evaluationScheme(dishes, method = "cross-validation", k = 10, given = 1)
# 采用算法列表对数据进行模型预测与评价,其推荐值n取15, 20, 25, 30, 35
results <- evaluate(dishes.es, algorithms, n = c(15, 20, 25, 30, 35))

# 画出评价结果的图形
plot(results, "prec/rec", legend = "topleft", cex = 0.67)

# 构建F1的评价指标
fvalue <- function(p, r) {
  return(2 * p * r / (p + r))
}

# 求两个模型的各个评价指标的均值,并将其转换为数据框的形式
library(plyr)
index <- ldply(avg(results))

# 将指标第一列有关于模型的名字重新命名
index[, 1] <- paste(index[, 1], c(15, 20, 25, 30, 35))

# 计算两个模型的F1的指标,并将所有指标综合
F1 <- fvalue(index[, 6], index[, 7])
dishes.Fvalue <- cbind(index, F1)
# 对评价指标值只保留3位小数
for (i in 2:ncol(dishes.Fvalue)) {
  dishes.Fvalue[, i] <- round(dishes.Fvalue[, i], 3)
}
write.csv(dishes.Fvalue, "./tmp/dishes_predict_index.csv", row.names = FALSE)

使用Apriori算法实现菜品的关联分析

# 【构建购物篮数据】
# 设置工作目录并读取数据
setwd()
info <- read.csv("./tmp/info_clear.csv", stringsAsFactors = FALSE)
detail <- read.csv("./tmp/detail_clear.csv", stringsAsFactors = FALSE)


# 建立aliment列表,每个列表代表一个订单的菜品
order.id <- unique(detail$order_id)  # 对id去重
aliment <- list()
for(i in 1:length(order.id)){
  aliment[[i]] <- detail[which(detail$order_id == order.id[i]), 4]
  aliment[[i]] <- unique(aliment[[i]])  # 去掉出现重复的事务
}

# 导出购物篮数据
require(plyr)
aliment1 <- ldply(aliment, rbind)  # 将列表转为数据框
row.names(aliment1) <- as.character(order.id)  # 修改数据框列名
write.csv(aliment1, "./tmp/aliment.csv")

# 【构建二元矩阵和Apriori模型】
# 创建购物篮的二元矩阵
col <- levels(as.factor(unlist(aliment)))  # 提取aliment列表中每个菜的菜名
ruleData <- matrix(FALSE, length(aliment), length(col))  # 创建一个空的矩阵
colnames(ruleData) <- col  # 修改ruleData的列名
row.names(ruleData) <- as.character(order.id)  # 修改ruleData的行名
# 每一个订单中所包含的菜改为1
for(i in 1:length(aliment)){
  ruleData[i, match(aliment[[i]], col)] <- TRUE
}

write.csv(ruleData, "./tmp/ruleData.csv", row.names = FALSE)  # 导出二元矩阵


# 构建关联规则模型
library(arules)
# 把数据转换成关联规则需要的数据类型
trans <- as(aliment, "transactions")
# 或直接使用ruleData数据进行建模
# trans <- read.csv("./tmp/ruleData.csv", stringsAsFactors = FALSE)

# 查看数据集前5行数据
inspect(trans[1:5])
# 生成关联规则
rules <- apriori(trans, parameter = list(support = 0.01, confidence = 0.5))
summary(rules)
inspect(sort(rules, by = list('support'))[1:10])  # 查看前10个支持度较高的规则

# 绝对数量显示
itemFrequencyPlot(trans, type = 'absolute', topN = 10, horiz = T)

# 查看前项为"芹菜炒腰花" 的规则
item <- subset(rules, subset = rhs %in% "芹菜炒腰花")
inspect(sort(item, by = "support"))

# 导出规则数据
write(item, "./tmp/item.csv", sep = ",", row.names = FALSE)
write(rules, "./tmp/rules.csv", sep = ",", row.names = FALSE)

# 【提取规则的前项和后项】
# 处理规则数据
result <- read.csv("./tmp/rules.csv", stringsAsFactors = FALSE)
# 将规则拆开
meal.recom <- strsplit(result$rules, "=>")

# 去除中括号
lhs <- 0
rhs <- 0
for (i in 1:length(meal.recom)) {
  lhs[i] <- gsub("[{|}+\n]|\\s", "", meal.recom[[i]][1])
  rhs[i] <- gsub("[{|}+\n]|\\s", "", meal.recom[[i]][2])
}

rules.new <- data.frame(lhs = lhs, rhs = rhs, support = result$support,
                        confidence = result$confidence, lift = result$lift)

write.csv(rules.new, "./tmp/rules_new.csv", row.names = FALSE)  # 写出数据

# 【进行综合得分】
# 计算综合评分
# 读取数据
rules.new <- read.csv("./tmp/rules_new.csv", stringsAsFactors = FALSE)
sales_volume <- read.csv("./tmp/sales_volume.csv", stringsAsFactors = FALSE)
profit <- read.csv("./tmp/profit.csv", stringsAsFactors = FALSE)
dish <- read.csv("./data/meal_dishes_detail.csv", stringsAsFactors = FALSE)

# 统计前项
rules.count <- as.data.frame(table(rules.new$lhs))
rules.count <- rules.count[order(rules.count$Freq, decreasing = TRUE), ]

# 提取前项为“芹菜炒腰花,孜然羊排”的数据,对推荐的菜品进行综合评分
# 计算每个菜所推荐的菜的综合评分
# 设A的权重a1 = 1.5, a2 = 2.5, a3 = 2, a4 = 4
A <- matrix(c(0, 2.5, 2, 4, 
              1.5, 0, 2, 4,
              1.5, 2.5, 0, 4,
              1.5, 2.5, 2, 0), 4, 4, byrow = T)
E <- c(1, 1, 1, 1)

# 初始化
rules.new$sales <- 0  # 热销度
rules.new$recommendation <- 0  # 主推度
rules.new$profit <- 0  # 毛利率
rules.new$mark <- 0  # 综合评分

for (i in 1:nrow(rules.new)) {
  # 找到对应的热销度
  sales.num <- which(sales_volume$dishes_name == rules.new$rhs[i])
  rules.new$sales[i] <- sales_volume$sales_hot[sales.num]
  
  # 找到对应的毛利率和主推度
  profit.num <- which(profit$dishes_name == rules.new$rhs[i])
  rules.new$profit[i] <- profit$rate[profit.num]
  rules.new$recommendation[i] <- profit$recommend_percent[profit.num]
  
  # 计算综合评分
  Y <- c(rules.new$sales[i], rules.new$recommendation[i], 
         rules.new$profit[i], rules.new$confidence[i])
  rules.new$mark[i] <- round((E - Y) %*% A %*% t(t(Y)), 3)
}

# 对综合评分进行排序
rules.new <- rules.new[order(rules.new$mark, decreasing = TRUE), ]

write.csv(rules.new, "./tmp/recommend.csv", row.names = FALSE)  # 写出数据


# 选取后项为"芹菜炒腰花" 的数据
rules.item <- rules.new[which(rules.new$rhs == "芹菜炒腰花"), ]
write.csv(rules.item, "./tmp/rules_item.csv", row.names = FALSE)

使用K-means算法进行客户价值分析

# 【数据预处理】
# 设置工作目录并读取数据
setwd()
info <- read.csv("./data/meal_order_info.csv", stringsAsFactors = FALSE)
users <- read.csv("./data/users.csv", stringsAsFactors = FALSE)

# 数据预处理
info <- info[which(info$order_status == 1), ]  # 提取有效订单

# 对info的时间列按用户的ID去重
info_time <- info[, c("emp_id", "use_start_time")]
library(plyr)
info_time <- ddply(info_time, .(emp_id), tail, n = 1)

# 匹配用户的最后一次用餐时间
for (i in 1:nrow(info)) {
  num <- which(users$USER_ID == info$emp_id[i])
  users[num, "LAST_VISITS"] <- info$use_start_time[i]
}

user <- users[-which(users$LAST_VISITS == ""), c(1, 3, 13, 15)]  # 特征选取

# 构建RFM特征
# 构建F特征
user.value1 <- as.data.frame(table(info$emp_id))  # 统计每个人的用餐次数
colnames(user.value1) <- c("USER_ID", "F")  # 修改列名

# 构建M特征
user.value2 <- aggregate(info[, "expenditure"], list(info$emp_id), FUN = 'sum')
colnames(user.value2) <- c("USER_ID", "M")
user.value <- merge(user.value1, user.value2, by = c("USER_ID"))  # 合并两个表

# 构建R特征
user.value <- merge(user.value, user, by = c("USER_ID"))  # 合并两个表
# 转换时间格式
last_time <- as.Date(user.value$LAST_VISITS, "%Y/%m/%d")
finally <- as.Date("2016-8-31")  # 观测窗口结束时间

user.value$R <- as.numeric(difftime(finally, last_time, units = "days"))

user.value <- user.value[, c(1,4,7,2,3)]  # 特征提取
write.csv(user.value, "./tmp/user_value.csv", row.names = FALSE)

# 【构建模型】
# 确定类数
user.value <- read.csv("./tmp/user_value.csv", stringsAsFactors = FALSE)
USER_ID <- user.value$USER_ID
ACCOUNT <- user.value$ACCOUNT
user.value <- user.value[, -c(1,2)]

# 标准化数据
standard <- scale(user.value)  # 数据标准化
write.csv(standard, './tmp/standard.csv', row.names = FALSE)  # 写出数据

# 求组间距离平方和与总体距离平方和的比值(betweenss / totss,越接近1越好)
BT <- 0
for(i in 1:10){
  model <- kmeans(user.value, centers = i)
  BT[i] <- model$betweenss / model$totss
}
plot(1:10, BT, type = "b", xlab = "聚类数", ylab = "组间平方和/总体距离平方和")

# 构建模型
set.seed(123)
result <- kmeans(standard, 3)

result$center  # 查看聚类中心值
result$size  # 查看每一类的个数

# 导出聚类后的数据
users.class <- cbind(USER_ID, ACCOUNT, user.value, class = result$cluster)
write.csv(users.class, "./tmp/users_class.csv", row.names = FALSE)

# 每一簇各指标的关系程度--雷达图
library(fmsb)
max <- apply(result$centers, 2, max)
min <- apply(result$centers, 2, min)
radar <- data.frame(rbind(max, min, result$centers))
radarchart(radar, pty = 32, plty = c(1:3), plwd = 4, vlcex = 1.2)  # 画雷达图
# 给雷达图加图例
L <- 1.2
for(i in 1:3){
  legend(1.0, L, legend = paste("客户群", i), lty = i, lwd = 3, col = i, bty = "n")
  L <- L - 0.3
}

用决策树算法实现餐饮客户流失预测

# 【合并客户信息表和订单表】
# 设置工作目录
setwd()
# 合并两个表
# 读取数据
users <- read.csv("./data/user_loss.csv", stringsAsFactors = FALSE)
info <- read.csv("./data/info_new.csv", stringsAsFactors = FALSE)

# 将时间转为时间格式
library(lubridate)
info$use_start_time <- parse_date_time(info$use_start_time, orders = "YmdHMS")
# info$lock_time <- parse_date_time(info$lock_time, orders = "YmdHMS")
# users$CREATED <- parse_date_time(users$CREATED, orders = "YmdHMS")

# 对info的时间列按用户的ID去重
info_time <- info[, c("emp_id", "use_start_time")]
library(plyr)
info_time <- ddply(info_time, .(emp_id), tail, n = 1)

# 匹配用户的最后一次用餐时间
for (i in 1:nrow(users)) {
  info1 <- info[which(info$name == users$ACCOUNT[i]), ]  # 提取某用户的订单数据
  if(nrow(info1) >= 1){
    info1 <- info1[order(info1$use_start_time), ]
    users[i, "LAST_VISITS"] <- as.character(info1$use_start_time[nrow(info1)])
  }
}

# 特征选取
user <- users[, c(1, 3, 15, 38)]
info <- info[which(info$order_status == 1), c(2,3,7)]  # 提取有效订单

names(info)[1] <- "USER_ID"  # 修改列名

info.user <- merge(user, info, by = "USER_ID")  # 合并两个表
write.csv(info.user, "./tmp/info_user.csv", row.names = FALSE)

# 【构建特征】
info.user <- read.csv("./tmp/info_user.csv", stringsAsFactors = FALSE)

# 提取info表的用户名和用餐时间,并按人名对用餐人数和金额进行分组求和
info.user1 <- as.data.frame(table(info.user$USER_ID))  # 统计每个人的用餐次数
colnames(info.user1) <- c("USER_ID", "frequence")  # 修改列名

# 求出每个人的消费总金额
info.user2 <- aggregate(info.user[, c(("number_consumers"), ("expenditure"))], 
                        list(info.user$USER_ID), FUN = "sum")  # 分组求和
colnames(info.user2) <- c("USER_ID", "numbers", "amount")
info.user.new <- merge(info.user1, info.user2, by = c("USER_ID"))  # 合并两个表

# 对合并后的数据进行处理
info.user <- info.user[, c(1:4)]
library(plyr)
info.user <- ddply(info.user, .(USER_ID, LAST_VISITS, type), tail, n = 1)

info.user.new <- merge(info.user.new, info.user, by = "USER_ID")  # 合并两个表


# 求平均消费金额,并保留2为小数
info.user.new$average <- round(info.user.new$amount / info.user.new$numbers, 2)
# 计算每个客户最近一次点餐的时间距离观测窗口结束的天数
# 修改时间列,改为日期
info.user.new$LAST_VISITS <- as.Date(info.user.new$LAST_VISITS)

datefinally <- as.Date("2016-7-31")  # 观测窗口结束时间

info.user.new$recently  <- difftime(datefinally, info.user.new$LAST_VISITS, 
                                    units = "days")  # 计算时间差
info.user.new$recently  <- as.numeric(info.user.new$recently )  # 转为数值型

info.user.new <- info.user.new[, c(1,5,2,4,8,9,7)]  # 特征选取
write.csv(info.user.new, "./tmp/info_user_clear.csv", row.names = FALSE)

# 【构建决策树模型并评价】
# 划分测试集、训练集
info.user <- read.csv("./tmp/info_user_clear.csv", stringsAsFactors = FALSE)

# 删除流失用户
info.user <- info.user[-which(info.user$type == "已流失"), ]

model.data <- info.user[, c(3:7)]
model.data$type <- as.factor(model.data$type)  # 将类标号转换为因子型

set.seed(12345)  # 设置随机种子
ind <- sample(2, nrow(model.data), replace = TRUE, prob = c(0.8, 0.2))
train <- model.data[ind == 1, ]
test <- model.data[ind == 2, ]

# 查看样本分布
table(train$type)
table(test$type)


# 构建决策树模型
library(rpart)
# method="class"为分类树;
# parms = list(split = "information")为选择信息熵计算纯度;
# xval设置交叉验证次数;minsplit设置最小分割;cp设置剪枝率。
rpart <- rpart(type ~ ., train, method = "class",
               parms = list(split = "information"),
               control=rpart.control(xval = 10, minsplit = 20, cp = 0.01))

# 画树
library(rattle)
fancyRpartPlot(rpart, cex = 0.7)  # 画决策图彩色图
asRules(rpart)  # 导出决策规则

#  测试集预测
pre.rpart <- predict(rpart, test, type = "class")  # 预测
table.pre <- table(test$type, pre.rpart)  # 混淆矩阵

(P <- table.pre[2, 2] / sum(table.pre[2, ]))  # 精确率
(R <- table.pre[2, 2] / sum(table.pre[, 2]))  # 召回率
(F1 <- 2 * P * R / (P + R))
  • 0
    点赞
  • 19
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值