<font face="微软雅黑" color=DodgerBlue>*IncomeESL Analysis*</font>

本文为自学 Rmarkdown 与 arules 包的练习笔记。

读入数据

# Load the IncomeESL CSV file into a data frame.
Data <- read.csv(file = "d://Rtest/数据挖掘组考核/IncomeESL_deal.csv")
# Preview the first ten rows to check the import.
head(x = Data, n = 10)

数据处理

# 数据处理
library(stringr)
# 1. Income distribution
# Inspect the raw income categories before recoding.
table(Data$income)
# Reduce each income bracket to the first number found in its text.
# str_extract() is vectorised and returns the first match (NA when the text
# contains no digits), so no list column or index loop is needed and empty or
# missing input no longer raises an error.
Data$income <- str_extract(Data$income, "\\d+")
# A bracket that starts at 0 is relabelled 5 (the 0-10 range is represented
# by 5). %in% is used because it yields FALSE rather than NA for missing values.
Data$income[Data$income %in% "0"] <- "5"
# Append the unit suffix, e.g. "10" becomes "10k".
Data$income <- str_c(Data$income, "k")
# Inspect the recoded categories.
table(Data$income)

# 2. Marital status
# Inspect the raw categories before recoding.
table(Data$marital.status)
# Collapse to two classes: cohabitation counts as married, every other
# listed status becomes "not marry". The named vector is applied in order,
# one pattern after another.
marital_recode <- c(
  "cohabitation" = "married",
  "divorced" = "not marry",
  "single" = "not marry",
  "widowed" = "not marry"
)
Data$marital.status <- str_replace_all(Data$marital.status, marital_recode)
# Inspect the recoded categories.
table(Data$marital.status)

# 3. Age distribution
# Inspect the raw age categories before recoding.
table(Data$age)
# Keep the first number found in each age label (the lower bound of the range).
# str_extract() is vectorised and already returns a character vector, so the
# list column, the 1:length() loop (which fails on empty data) and the
# unlist() round trip are not needed.
Data$age <- str_extract(Data$age, "\\d+")
# Append the unit suffix, e.g. "18" becomes "18year".
Data$age <- str_c(Data$age, "year")
# Inspect the recoded categories.
table(Data$age)

# 4. Education
# Inspect the raw categories before recoding.
table(Data$education)
# Collapse to two classes: college and above become "high graduate",
# everything else becomes "low graduate". Patterns are applied in order.
# "college.*" replaces everything from "college" to the end of the value, so
# it already covers "college graduate"; a separate replacement for that value
# could never match and is therefore omitted.
education_recode <- c(
  "college.*" = "high graduate",
  "graduate study" = "high graduate",
  "grade <9" = "low graduate",
  "grades 9-11" = "low graduate",
  "high school graduate" = "low graduate"
)
Data$education <- str_replace_all(Data$education, education_recode)
# Inspect the recoded categories.
table(Data$education)

# 5. Occupation
# Inspect the raw categories before recoding.
table(Data$occupation)
# Collapse to two classes: occupations with an income source ("income") and
# those without one ("not income"). Patterns are applied in the listed order.
occupation_recode <- c(
  "clerical/service" = "income",
  "homemaker" = "not income",
  "laborer" = "income",
  "military" = "income",
  "professional/managerial" = "income",
  "retired" = "not income",
  "sales" = "income",
  "student" = "not income",
  "unemployed" = "not income"
)
Data$occupation <- str_replace_all(Data$occupation, occupation_recode)
# Inspect the recoded categories.
table(Data$occupation)

# 6. Years lived in the San Francisco Bay Area
# Inspect the raw categories before recoding.
table(Data$years.in.bay.area)
# Collapse to two classes: more than 10 years is "livelong", everything else
# is "liveshort". Patterns are applied in the listed order.
# NOTE(review): the "2017.*" pattern presumably targets range labels that were
# turned into dates somewhere upstream - confirm against the CSV.
bay_area_recode <- c(
  ">10" = "livelong",
  "2017.*" = "liveshort",
  "<1" = "liveshort"
)
Data$years.in.bay.area <- str_replace_all(
  Data$years.in.bay.area,
  bay_area_recode
)

# 7. Dual incomes
# Inspect the raw categories before recoding.
table(Data$dual.incomes)
# Collapse to two classes. "no$" is anchored to the end of the value so that
# it only rewrites a trailing "no"; "not married" is handled by its own entry.
dual_income_recode <- c(
  "no$" = "no dual incomes",
  "not married" = "no dual incomes",
  "yes" = "dual incomes"
)
Data$dual.incomes <- str_replace_all(Data$dual.incomes, dual_income_recode)
# Inspect the recoded categories.
table(Data$dual.incomes)

# 8. Number of people in the household
# Strip the trailing "+" from the open-ended "9+" category. The plus sign is
# escaped so that only a literal "9+" is rewritten; an unescaped "9." would
# match a 9 followed by any character.
Data$number.in.household <- str_replace_all(
  Data$number.in.household,
  "9\\+",
  "9"
)
# Append a suffix so the value is distinguishable from other numeric items.
# The suffix text is kept exactly as before because it becomes the item label.
Data$number.in.household <- str_c(Data$number.in.household, "hoursehold")
# Inspect the recoded categories.
table(Data$number.in.household)

# 9. Number of children
# Strip the trailing "+" from the open-ended "9+" category. The plus sign is
# escaped so that only a literal "9+" is rewritten; an unescaped "9." would
# match a 9 followed by any character.
Data$number.of.children <- str_replace_all(
  Data$number.of.children,
  "9\\+",
  "9"
)
# Append a suffix so the value is distinguishable from other numeric items.
Data$number.of.children <- str_c(Data$number.of.children, "child")
# Inspect the recoded categories.
table(Data$number.of.children)


# 10. Householder status
# Inspect the raw categories before recoding.
table(Data$householder.status)
# Fold "live with parents/family" into "own"; other values are left untouched.
Data$householder.status <- str_replace_all(
  string = Data$householder.status,
  pattern = "live with parents/family",
  replacement = "own"
)
# Inspect the recoded categories.
table(Data$householder.status)

# 11. Type of home
# Inspect the raw categories before recoding.
table(Data$type.of.home)
# Collapse to two classes: "house" stays as it is, every other listed type
# becomes "other house". "other" is rewritten first, before the remaining
# replacements introduce the text "other house".
home_recode <- c(
  "other" = "other house",
  "apartment" = "other house",
  "condominium" = "other house",
  "mobile Home" = "other house"
)
Data$type.of.home <- str_replace_all(Data$type.of.home, home_recode)
# Inspect the recoded categories.
table(Data$type.of.home)

# 12. Ethnic classification
# Inspect the raw categories before recoding.
table(Data$ethnic.classification)
# Collapse to "white" versus "other race". "other" is rewritten first, before
# the remaining replacements introduce the text "other race".
ethnic_recode <- c(
  "other" = "other race",
  "american indian" = "other race",
  "asian" = "other race",
  "black" = "other race",
  "east indian" = "other race",
  "hispanic" = "other race",
  "pacific islander" = "other race"
)
Data$ethnic.classification <- str_replace_all(
  Data$ethnic.classification,
  ethnic_recode
)
# Inspect the recoded categories.
table(Data$ethnic.classification)

# 13. Language spoken at home
# Inspect the raw categories before recoding.
table(Data$language.in.home)
# Collapse to English versus "other language". "other" is rewritten first,
# before the "spanish" replacement introduces the text "other language".
language_recode <- c(
  "other" = "other language",
  "spanish" = "other language"
)
Data$language.in.home <- str_replace_all(Data$language.in.home, language_recode)
# Inspect the recoded categories.
table(Data$language.in.home)

# Coerce the recoded columns back into a plain data frame, preview it, and
# write it out as a comma-separated file for the association-rule step.
Data <- as.data.frame(Data)
head(Data)
# FALSE is spelled out: the shorthand F is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word.
write.table(
  Data,
  "d://Rtest/数据挖掘/数据挖掘考核_deal.csv",
  row.names = FALSE,
  sep = ","
)

数据关联规则挖掘

library(Matrix)
library(arules)
library(arulesViz)
# Read the exported CSV as transactions: each row is one basket, fields are
# comma-separated, and the first line (the header) is skipped.
data <- read.transactions(
  file = "d://Rtest/数据挖掘/数据挖掘考核_deal.csv",
  format = "basket",
  sep = ",",
  skip = 1
)

# Explore rules at minimum support 0.2 and minimum confidence 0.5.
rules <- apriori(
  data,
  parameter = list(support = 0.2, confidence = 0.5)
)
# Show up to 10 rules per ordering. head() returns however many rules exist
# when fewer than 10 were mined, whereas indexing with [1:10] would fail.
inspect(head(sort(rules, by = "support"), n = 10))    # top rules by support
inspect(head(sort(rules, by = "confidence"), n = 10)) # top rules by confidence


# Explore rules at minimum support 0.1 and minimum confidence 0.5.
rules <- apriori(
  data,
  parameter = list(support = 0.1, confidence = 0.5)
)

# Show up to 10 rules per ordering. head() returns however many rules exist
# when fewer than 10 were mined, whereas indexing with [1:10] would fail.
inspect(head(sort(rules, by = "support"), n = 10))    # top rules by support
inspect(head(sort(rules, by = "confidence"), n = 10)) # top rules by confidence

# Visualise the mined rules.
# Default plot shaded by rule order, titled "Two-key plot" (chart 1).
plot(x = rules, control = list(main = "Two-key plot"), shading = "order")
# Grouped view of the rules (chart 2).
plot(x = rules, method = "grouped")
# Graph view of the rules (chart 3).
plot(x = rules, method = "graph")

结论

当lift大于1时说明使用规则是有效的。
由上表可知:
1.白人更多使用的是英语。
2.没有收入或没有高学历的,没有双份收入
3.没有结婚,租房子的,没有双份收入。
4.0-10K收入,没有结婚的,没有双份收入。
5.居住很短,没有结婚,没有双份收入。~PS:老美:屌丝是不配结婚的T.T~
原谅我一本正经的胡说一顿分析结果,关联规则还是得继续努力学习。
路漫漫其修远兮

<font face="STCAIYUN" color=gray size=5>面对R语言的疾风吧!!!!!</font>

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值