IncomeESL Analysis

最新推荐文章于 2022-07-24 15:33:16 发布

闲看窗外雨丶

最新推荐文章于 2022-07-24 15:33:16 发布

阅读量467

点赞数

分类专栏： R语言文章标签：数据挖掘 r语言关联规则分类

本文链接：https://blog.csdn.net/qq_31942317/article/details/78657983

版权

R语言专栏收录该内容

5 篇文章 0 订阅

订阅专栏

本文为自学 R Markdown 与 arules 包的练习记录

读入数据

# Load the survey CSV into a data frame.
Data <- read.csv(file = "d://Rtest/数据挖掘组考核/IncomeESL_deal.csv")
# Preview the first 10 rows.
head(x = Data, n = 10)

数据处理

# 数据处理
library(stringr)
# 1. Income: inspect the raw distribution.
table(Data$income)
# Turn each income bracket into a single discrete label built from the first
# number that appears in it (the bracket's lower bound).
Data$income <- local({
  # str_extract() is vectorised and returns NA for values without digits,
  # so no per-row loop or list column is needed and empty input is safe.
  lower_bound <- str_extract(Data$income, "\\d+")
  # The bracket starting at 0 (0-10) is represented by 5 instead of 0.
  # %in% never yields NA, so missing values are left untouched.
  lower_bound[lower_bound %in% "0"] <- "5"
  # Append the unit suffix; NA stays NA.
  str_c(lower_bound, "k")
})
# Inspect the recoded distribution.
table(Data$income)

# 2. Marital status: inspect the raw distribution.
table(Data$marital.status)
# Collapse to two classes: cohabitation counts as "married"; divorced, single
# and widowed all become "not marry". The named vector is applied in order,
# one pattern = replacement pair at a time.
Data$marital.status <- str_replace_all(
  Data$marital.status,
  c(
    "cohabitation" = "married",
    "divorced" = "not marry",
    "single" = "not marry",
    "widowed" = "not marry"
  )
)
# Inspect the recoded distribution.
table(Data$marital.status)

# 3. Age: inspect the raw distribution.
table(Data$age)
# Keep only the first number found in each age bracket (its lowest age) and
# append the "year" suffix. str_extract() is vectorised and returns NA when a
# value has no digits, which replaces the former list column + index loop and
# also works on zero-row input.
Data$age <- str_c(str_extract(Data$age, "\\d+"), "year")
# Inspect the recoded distribution.
table(Data$age)

# 4. Education: inspect the raw distribution.
table(Data$education)
# Collapse to two classes: anything starting at "college" and "graduate study"
# become "high graduate"; the remaining levels become "low graduate".
# Pairs are applied in order. "college.*" already rewrites every value that
# contains "college" (including "college graduate"), so no separate
# "college graduate" rule is needed.
Data$education <- str_replace_all(
  Data$education,
  c(
    "college.*" = "high graduate",
    "graduate study" = "high graduate",
    "grade <9" = "low graduate",
    "grades 9-11" = "low graduate",
    "high school graduate" = "low graduate"
  )
)
# Inspect the recoded distribution.
table(Data$education)

# 5. Occupation: inspect the raw distribution.
table(Data$occupation)
# Collapse to two classes: occupations with a source of income ("income")
# versus those without one ("not income"). Pairs are applied in order.
Data$occupation <- str_replace_all(
  Data$occupation,
  c(
    "clerical/service" = "income",
    "homemaker" = "not income",
    "laborer" = "income",
    "military" = "income",
    "professional/managerial" = "income",
    "retired" = "not income",
    "sales" = "income",
    "student" = "not income",
    "unemployed" = "not income"
  )
)
# Inspect the recoded distribution.
table(Data$occupation)

# 6. Years lived in the San Francisco Bay Area: inspect the raw distribution.
table(Data$years.in.bay.area)
# Collapse to two classes: ">10" becomes "livelong", everything matched below
# becomes "liveshort". Pairs are applied in order.
# NOTE(review): "2017.*" presumably targets year ranges that were converted
# to 2017 dates when the CSV was prepared - confirm against the raw file.
Data$years.in.bay.area <- str_replace_all(
  Data$years.in.bay.area,
  c(
    ">10" = "livelong",
    "2017.*" = "liveshort",
    "<1" = "liveshort"
  )
)

# 7. Dual incomes: inspect the raw distribution.
table(Data$dual.incomes)
# Collapse to two classes. "no$" is anchored to the end of the string so it
# only rewrites a trailing "no"; "not married" is handled by its own rule.
Data$dual.incomes <- str_replace_all(
  Data$dual.incomes,
  c(
    "no$" = "no dual incomes",
    "not married" = "no dual incomes",
    "yes" = "dual incomes"
  )
)
# Inspect the recoded distribution.
table(Data$dual.incomes)

# 8. Household size: reduce "9" followed by one character to a bare "9", then
# append a suffix to every value.
# NOTE(review): the "." in "9." is a regex wildcard (any character after 9);
# presumably the intended target is a literal top level such as "9+" - confirm.
Data$number.in.household <- str_c(
  str_replace_all(
    string = Data$number.in.household,
    pattern = "9.",
    replacement = "9"
  ),
  "hoursehold"
)
# Inspect the recoded distribution.
table(Data$number.in.household)

# 9. Number of children: reduce "9" followed by one character to a bare "9",
# then append a suffix to every value.
# NOTE(review): the "." in "9." is a regex wildcard (any character after 9);
# presumably the intended target is a literal top level such as "9+" - confirm.
Data$number.of.children <- str_c(
  str_replace_all(
    string = Data$number.of.children,
    pattern = "9.",
    replacement = "9"
  ),
  "child"
)
# Inspect the recoded distribution.
table(Data$number.of.children)


# 10. Householder status: inspect the raw distribution.
table(Data$householder.status)
# Collapse to own / rent: living with parents or family is counted as "own".
Data$householder.status <- str_replace_all(
  string = Data$householder.status,
  pattern = "live with parents/family",
  replacement = "own"
)
# Inspect the recoded distribution.
table(Data$householder.status)

# 11. Type of home: inspect the raw distribution.
table(Data$type.of.home)
# Collapse to two classes: "house" is left as is, every other type becomes
# "other house". Pairs are applied in order; "other" is rewritten first.
Data$type.of.home <- str_replace_all(
  Data$type.of.home,
  c(
    "other" = "other house",
    "apartment" = "other house",
    "condominium" = "other house",
    "mobile Home" = "other house"
  )
)
# Inspect the recoded distribution.
table(Data$type.of.home)

# 12. Ethnic classification: inspect the raw distribution.
table(Data$ethnic.classification)
# Collapse to two classes: "white" is left as is, every listed group becomes
# "other race". Pairs are applied in order; "other" is rewritten first.
Data$ethnic.classification <- str_replace_all(
  Data$ethnic.classification,
  c(
    "other" = "other race",
    "american indian" = "other race",
    "asian" = "other race",
    "black" = "other race",
    "east indian" = "other race",
    "hispanic" = "other race",
    "pacific islander" = "other race"
  )
)
# Inspect the recoded distribution.
table(Data$ethnic.classification)

# 13. Language spoken at home: inspect the raw distribution.
table(Data$language.in.home)
# Collapse to English versus everything else ("other language").
# Pairs are applied in order; "other" is rewritten first.
Data$language.in.home <- str_replace_all(
  Data$language.in.home,
  c(
    "other" = "other language",
    "spanish" = "other language"
  )
)
# Inspect the recoded distribution.
table(Data$language.in.home)

# Make sure the recoded columns are held in a plain data frame and preview it.
Data <- as.data.frame(Data)
head(Data)
# Save the recoded data as comma-separated text without row names.
# TRUE/FALSE are used instead of T/F because T and F are ordinary variables
# that can be reassigned.
# NOTE(review): this writes under "数据挖掘" while the input was read from
# "数据挖掘组考核" - confirm the output directory is intended.
write.table(
  Data,
  "d://Rtest/数据挖掘/数据挖掘考核_deal.csv",
  row.names = FALSE,
  sep = ","
)

数据关联规则挖掘

library(Matrix)
library(arules)
library(arulesViz)
# Re-read the recoded CSV as transactions: every line is one basket whose
# comma-separated fields are its items; the first line (header) is skipped.
data <- read.transactions(
  file = "d://Rtest/数据挖掘/数据挖掘考核_deal.csv",
  format = "basket",
  sep = ",",
  skip = 1
)

# Exploration with minimum support 0.2 and minimum confidence 0.5.
rules <- apriori(
  data,
  parameter = list(
    support = 0.2,
    confidence = 0.5
  )
)
# Show at most the top 10 rules by support, then by confidence.
# head() is used instead of [1:10] so the call still works when fewer than
# 10 rules were mined.
inspect(head(sort(rules, by = "support"), n = 10))
inspect(head(sort(rules, by = "confidence"), n = 10))


# Exploration with minimum support 0.1 and minimum confidence 0.5.
rules <- apriori(
  data,
  parameter = list(
    support = 0.1,
    confidence = 0.5
  )
)

# Show at most the top 10 rules by support, then by confidence.
# head() is used instead of [1:10] so the call still works when fewer than
# 10 rules were mined.
inspect(head(sort(rules, by = "support"), n = 10))
inspect(head(sort(rules, by = "confidence"), n = 10))

# Visualise the rules mined at support 0.1.
# Default plot shaded by rule order, titled "Two-key plot" (chart 1).
plot(rules, shading = "order", control = list(main = "Two-key plot"))
# Grouped plot (chart 2).
plot(rules, method = "grouped")
# Graph plot (chart 3).
plot(rules, method = "graph")

结论

当lift大于1时说明使用规则是有效的。
由上表可知:
1.白人更多使用的是英语。
2.没有收入或没有高学历的,没有双份收入
3.没有结婚,租房子的,没有双份收入。
4. 收入在0-10K、没有结婚的，没有双份收入。
5.居住很短,没有结婚，没有双份收入。~PS:老美：屌丝是不配结婚的T.T~
原谅我一本正经的胡说一顿分析结果,关联规则还是得继续努力学习。
路漫漫其修远兮

面对R语言的疾风吧!!!!!

闲看窗外雨丶

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
*IncomesESL Analy*

本文章自学Rmarkdown+arule包读入数据# 读取数据Data <- read.csv("d://Rtest/数据挖掘组考核/IncomeESL_deal.csv")# 观测数据head(Data,10)数据处理# 数据处理library(stringr)# 1.收入分布观测table(Data$income)# 连续变量转化为离散变量，Data$income <- str_
复制链接

扫一扫