# 导入所需的包
library(gbm)
library(ggplot2)
library(ada)
library(class)
library(MASS)
library(caret)
# Read the churn CSV and attach the canonical column names.
#
# file_path: path to a CSV file with 21 columns in the expected order.
# Returns a data.frame whose columns are renamed to the standard schema.
load_data <- function(file_path) {
  col_names <- c('state', 'acc_length', 'area', 'ph_num', 'inter_plan', 'vm_plan', 'num_vm_message', 'day_min',
                 'day_calls', 'day_charge', 'eve_min', 'eve_calls', 'eve_charge', 'night_min', 'night_calls',
                 'night_charge', 'inter_min', 'inter_calls', 'inter_charge', 'cus_ser_calls', 'churn')
  raw <- read.csv(file_path, header = TRUE)
  colnames(raw) <- col_names
  raw
}
# Print a quick overview of the dataset: row/column counts, the column
# structure, and the class balance of the churn target.
check_data_feature <- function(data) {
  dims <- dim(data)
  print(dims)
  print(str(data))
  churn_counts <- table(data$churn)
  print(churn_counts)
}
# Plot the churn class distribution, customer-service-call counts, and
# density-overlaid histograms for every continuous usage feature.
#
# data: the churn data.frame produced by load_data().
# Side effect: draws on the current graphics device.
draw_feature_plot <- function(data) {
  # FIX: save and restore the caller's graphics parameters instead of
  # leaving mfrow/mar permanently modified after this function returns.
  old_par <- par(no.readonly = TRUE)
  on.exit(par(old_par), add = TRUE)
  par(mfrow = c(1, 2))
  barplot(table(data$churn), col = "skyblue", main = "churn True or False", las = 1)
  barplot(table(data$cus_ser_calls),
          col = "skyblue", main = "customer service calls times", las = 1)
  # 4x3 grid: one histogram per continuous usage column
  par(mfrow = c(4, 3), mar = c(3, 3, 2, 1))
  usage_cols <- c('day_min', 'day_calls', 'day_charge', 'eve_min', 'eve_calls', 'eve_charge',
                  'night_min', 'night_calls', 'night_charge', 'inter_min', 'inter_calls', 'inter_charge')
  for (col_name in usage_cols) {
    values <- as.numeric(data[[col_name]])
    hist(values, main = paste('Density of', col_name), xlab = col_name, col = 'skyblue',
         border = 'black', probability = TRUE)
    # Overlay a kernel density estimate on the probability-scaled histogram
    lines(density(values), col = 'red', lwd = 2)
  }
}
# Visualize how churn relates to the international plan and to the number
# of customer-service calls, as stacked bar charts (printed to the device).
feature_associated <- function(data) {
  # Build a long-format count frame for a feature crossed with churn.
  count_frame <- function(feature, feature_name) {
    counts <- as.data.frame(table(feature, data$churn))
    names(counts) <- c(feature_name, 'churn', 'count')
    counts
  }
  df_inter <- count_frame(data$inter_plan, 'inter_plan')
  p1 <- ggplot(df_inter, aes(x = inter_plan, y = count, fill = churn)) +
    geom_bar(stat = 'identity', position = 'stack') +
    labs(title = 'Inter or No Inter of Churn', x = 'Inter or Not Inter', y = 'Number')
  df_cus <- count_frame(data$cus_ser_calls, 'cus_ser_calls')
  p2 <- ggplot(df_cus, aes(x = cus_ser_calls, y = count, fill = churn)) +
    geom_bar(stat = 'identity', position = 'stack') +
    labs(title = 'Customer Service Calls about Churn', x = 'Customer Service Calls', y = 'Numbers') +
    theme_minimal()
  print(p1)
  print(p2)
}
# Prepare the modelling matrices: encode the target as 0/1, expand the two
# yes/no plan columns into dummy variables, drop identifier columns, and
# standardise every remaining feature.
#
# data: the churn data.frame produced by load_data().
# Returns list(X = scaled feature matrix, y = numeric 0/1 target vector).
deal_data <- function(data) {
  # Target: 1 for churned customers, 0 otherwise
  target <- ifelse(data$churn == "True", 1, 0)
  # One-hot encode the categorical plan indicators (no intercept column)
  dummies_inter <- model.matrix(~inter_plan - 1, data)
  dummies_vm <- model.matrix(~vm_plan - 1, data)
  combined <- cbind(data, dummies_inter, dummies_vm)
  # Identifiers and already-encoded columns carry no modelling value
  drop_cols <- c('state', 'area', 'ph_num', 'inter_plan', 'vm_plan', 'churn')
  features <- combined[, !(names(combined) %in% drop_cols)]
  # Centre and scale every feature column
  list(X = scale(features), y = target)
}
# Compare three baseline classifiers (logistic regression, LDA, 3-NN) on a
# fixed 70/30 train/test split and plot their test-set accuracies.
#
# X: scaled feature matrix from deal_data(); y: 0/1 target vector.
# Returns the ggplot object comparing the accuracies.
choose_algorithm <- function(X, y) {
  set.seed(1)
  # FIX: size the split from the data instead of hard-coding 3333/2333
  # (floor(0.7 * 3333) == 2333, so behavior on the original data is unchanged).
  n <- nrow(X)
  train_index <- sample(n, floor(0.7 * n))
  # Split the data into training and test sets
  X_train <- X[train_index, ]
  X_test <- X[-train_index, ]
  y_train <- y[train_index]
  y_test <- y[-train_index]
  # Logistic regression: threshold predicted probabilities at 0.5
  fit_lr <- glm(y_train ~ ., data = as.data.frame(X_train), family = binomial)
  pred_test_lr <- predict(fit_lr, newdata = as.data.frame(X_test), type = "response") > 0.5
  conf_matrix_test_lr <- table(predicted = pred_test_lr, Actual = y_test)
  Accuracy_test_lr <- sum(diag(conf_matrix_test_lr)) / sum(conf_matrix_test_lr)
  # Linear discriminant analysis
  fit_lda <- lda(y_train ~ ., data = as.data.frame(X_train))
  pred_test_lda <- predict(fit_lda, newdata = as.data.frame(X_test))$class
  conf_matrix_test_lda <- table(predicted = pred_test_lda, Actual = y_test)
  Accuracy_test_lda <- sum(diag(conf_matrix_test_lda)) / sum(conf_matrix_test_lda)
  # K-nearest neighbours (k = 3)
  fit_knn <- knn(X_train, X_test, cl = y_train, k = 3)
  conf_matrix_test_knn <- table(predicted = fit_knn, Actual = y_test)
  Accuracy_test_knn <- sum(diag(conf_matrix_test_knn)) / sum(conf_matrix_test_knn)
  # Report each model's test-set accuracy
  cat("Logistic Regression Accuracy:", Accuracy_test_lr, "\n")
  cat("Linear Discriminant Analysis Accuracy:", Accuracy_test_lda, "\n")
  cat("K-Nearest Neighbors Accuracy:", Accuracy_test_knn, "\n")
  # Collect model names and accuracies for plotting
  df <- data.frame(Model = c("LR", "LDA", "KNN(k=3)"),
                   Accuracy = c(Accuracy_test_lr, Accuracy_test_lda, Accuracy_test_knn))
  # One point per model; the boxplot degenerates to a line per single value
  ggplot(df, aes(x = Model, y = Accuracy)) +
    geom_boxplot(fill = "lightblue", color = "darkblue") +
    geom_point(size = 3, color = "red") +
    labs(title = "Model Accuracies on Test Set", y = "Accuracy") +
    theme_minimal()
}
# Train two boosted models (gradient boosting via gbm, AdaBoost via ada)
# on a 70/30 split and report training and test accuracy for each.
#
# X: scaled feature matrix from deal_data(); y: 0/1 target vector.
# Side effect: prints the four accuracy figures via cat().
improve_result <- function(X, y) {
  set.seed(1)
  # FIX: size the split from the data instead of hard-coding 3333/2333
  # (floor(0.7 * 3333) == 2333, so behavior on the original data is unchanged).
  n <- nrow(X)
  train_index <- sample(n, floor(0.7 * n))
  # Split the data into training and test sets
  X_train <- X[train_index, ]
  X_test <- X[-train_index, ]
  y_train <- y[train_index]
  y_test <- y[-train_index]
  # FIX: put the response into the modelling data frame instead of relying
  # on gbm() resolving `y_train` from the enclosing environment.
  train_df <- data.frame(y_train = y_train, as.data.frame(X_train))
  # Gradient boosting with 100 depth-1 trees, learning rate 0.1
  gb_model <- gbm(y_train ~ ., data = train_df, distribution = "bernoulli",
                  n.trees = 100, interaction.depth = 1, shrinkage = 0.1)
  # Predictions on the training set (probabilities thresholded at 0.5)
  prob_train <- predict(gb_model, newdata = as.data.frame(X_train), n.trees = 100, type = "response")
  pred_train <- prob_train > 0.5
  # Predictions on the test set
  prob_test <- predict(gb_model, newdata = as.data.frame(X_test), n.trees = 100, type = "response")
  pred_test <- prob_test > 0.5
  # Confusion matrices and accuracies
  conf_matrix_train <- table(predicted = pred_train, Actual = y_train)
  accuracy_train <- sum(diag(conf_matrix_train)) / sum(conf_matrix_train)
  conf_matrix_test <- table(predicted = pred_test, Actual = y_test)
  accuracy_test <- sum(diag(conf_matrix_test)) / sum(conf_matrix_test)
  cat("Gradient Boosting训练集准确率:", accuracy_train, "\n")
  cat("Gradient Boosting测试集准确率:", accuracy_test, "\n")
  # AdaBoost on the same training frame (same columns as the cbind version)
  ada_model <- ada(y_train ~ ., data = train_df, iter = 100)
  # Predictions on the training set
  pred_train_ada <- predict(ada_model, newdata = as.data.frame(X_train))
  # Predictions on the test set
  pred_test_ada <- predict(ada_model, newdata = as.data.frame(X_test))
  # Confusion matrices and accuracies
  conf_matrix_train_ada <- table(predicted = pred_train_ada, Actual = y_train)
  accuracy_train_ada <- sum(diag(conf_matrix_train_ada)) / sum(conf_matrix_train_ada)
  conf_matrix_test_ada <- table(predicted = pred_test_ada, Actual = y_test)
  accuracy_test_ada <- sum(diag(conf_matrix_test_ada)) / sum(conf_matrix_test_ada)
  cat("AdaBoost训练集准确率:", accuracy_train_ada, "\n")
  cat("AdaBoost测试集准确率:", accuracy_test_ada, "\n")
}
# Draw a hard-coded 2x2 confusion matrix (counts from the boosted model)
# as a ggplot tile heat map. Returns the ggplot object.
hunxiao <- function() {
  # Long-format confusion matrix: rows are (Actual, Predicted, count).
  # FIX: the Predicted labels were c("0", "") — the second level was an
  # empty string instead of "1", so half the tiles landed on a blank
  # y-axis category. Corrected to c("0", "1").
  conf_matrix_data <- data.frame(
    Actual = rep(c("0", "1"), each = 2),
    Predicted = rep(c("0", "1"), times = 2),
    Value = c(869, 34, 4, 103) # counts from the final model's test confusion matrix
  )
  # Tile heat map with the count printed in each cell
  ggplot(conf_matrix_data, aes(x = Actual, y = Predicted, fill = Value)) +
    geom_tile() +
    geom_text(aes(label = Value), vjust = 1) +
    scale_fill_gradient(low = "lightblue", high = "darkblue") +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      axis.title = element_blank(),
      axis.ticks = element_blank(),
      panel.grid = element_blank(),
      legend.position = "none"
    ) +
    labs(title = "Custom Confusion Matrix")
}
# Run the full churn-analysis pipeline: load, inspect, plot, preprocess,
# compare baseline models, boost, and plot the final confusion matrix.
#
# file_path: path to the churn CSV. Defaults to the original author's path
# so existing `main()` calls keep working; pass a path to generalize.
main <- function(file_path = "C:/Users/27128/Desktop/R_project/R_P/data.csv") {
  data <- load_data(file_path)
  check_data_feature(data)
  draw_feature_plot(data)
  feature_associated(data)
  result <- deal_data(data)
  X <- result$X
  y <- result$y
  choose_algorithm(X, y)
  improve_result(X, y)
  hunxiao()
}
# Entry point: execute the whole analysis pipeline when the script is run.
main()
# --- Non-code trailer carried over from the source blog post ---
# 基于逻辑回归、GBDT、AdaBoost模型的客户流失预测(1)
# (Customer churn prediction based on logistic regression, GBDT, and AdaBoost, part 1;
#  latest recommended article published 2024-11-01 17:13:19.)