面板数据清理遇到的问题(全)

原始数据表,都是面板数据

目标数据表:

 

 

 

 

 

# ---- Step 1: load the raw review panel and drop noise rows ----
# Every relative file name used by this script is resolved against this
# directory.
setwd("c:/users/11565/Desktop/合并二/可以用数据")
sentiment <- read.csv("001.csv")
names(sentiment)
dim(sentiment)

# Treat the date column as plain strings so it can be searched for text.
sentiment$date <- as.character(sentiment$date)

# Drop noise rows: any row whose date field contains the character "确"
# (for example "确认收货" entries that are not real dates).
#
# This is done with a single vectorised filter. The previous version removed
# rows one at a time while looping over the original index range, which
# skipped the row following each deletion (so it had to be repeated six
# times) and re-created all-NA rows for indices past the shrunken end of the
# table. The hard-coded `sentiment[1:31600, ]` slice only existed to trim
# those padding rows again; with the filter below nothing is padded, so no
# fixed row count is needed.
# NOTE(review): confirm that the old 31600 cut-off had no other purpose.
is_noise <- grepl("确", sentiment$date, fixed = TRUE)
sentiment <- sentiment[!is_noise, , drop = FALSE]

dim(sentiment)
table(sentiment$date)

# ---- Step 2: turn the two sentiment scores into rating classes ----

# Map a vector of sentiment scores to a numeric code and a class label.
#
# Args:
#   score: numeric vector of sentiment scores.
#
# Returns a list with two vectors of the same length as `score`:
#   critic: 1 for "好评" (score >= 0.7), 2 for "中评" (0.3 < score < 0.7),
#           -1 for "差评" (0 <= score <= 0.3), NA otherwise.
#   label:  the matching class label, NA otherwise.
#
# Scores that are NA or fall outside every range stay NA instead of
# silently inheriting the class of the first row.
classify_sentiment <- function(score) {
  critic <- rep(NA_real_, length(score))
  label <- rep(NA_character_, length(score))

  is_good <- !is.na(score) & score >= 0.7
  is_neutral <- !is.na(score) & score > 0.3 & score < 0.7
  # NOTE(review): the loop for sentiment_value1 used `> 0` here while the
  # loop for sentiment_value2 used `>= 0`; both now use `>= 0` so a score of
  # exactly 0 is rated "差评" for both columns. Confirm this is intended.
  is_bad <- !is.na(score) & score >= 0 & score <= 0.3

  critic[is_good] <- 1
  label[is_good] <- "好评"
  # NOTE(review): the neutral class is coded 2 (not 0); later steps sum
  # these codes per class, so the value is kept exactly as it was.
  critic[is_neutral] <- 2
  label[is_neutral] <- "中评"
  critic[is_bad] <- -1
  label[is_bad] <- "差评"

  list(critic = critic, label = label)
}

# Read the score columns straight from the data frame. The previous code
# used the bare names `sentiment_value1` / `sentiment_value2`, which are
# only visible while the data frame is attached, and it ended with a
# `detach()` call that had no matching `attach()`.
rated1 <- classify_sentiment(sentiment$sentiment_value1)
sentiment$critic1 <- rated1$critic
sentiment$class1 <- rated1$label

rated2 <- classify_sentiment(sentiment$sentiment_value2)
sentiment$critic2 <- rated2$critic
sentiment$class2 <- rated2$label
#
# ---- Step 3: per-shop, per-day rating totals ----

# Build the "label row + value row" table of rating totals for one score.
#
# Args:
#   reviews:   data frame with columns c_bianhao and date plus the two
#              columns named by score_col and class_col.
#   shop_ids:  shop identifiers to report, in output order.
#   score_col: name of the numeric column to sum (e.g. "critic1").
#   class_col: name of the rating-class column (e.g. "class1").
#
# Returns a data frame with columns 差, 中, 好, c_bianhao, date. The first
# row is a placeholder filled with 1. After it, every (shop, date) pair
# contributes two rows: a label row ("差评", "中评", "好评") followed by a
# value row holding the summed score per class, with 0 for a class that
# has no reviews on that day.
#
# Totals are looked up by class label. The previous code picked them by
# column position after aggregate(), and its two-class and three-class
# branches assumed different orders for "中评" and "好评", so one of them
# put totals under the wrong heading.
summarise_daily_classes <- function(reviews, shop_ids, score_col, class_col) {
  class_labels <- c("差评", "中评", "好评")

  # Placeholder first row, kept so the exported file layout is unchanged.
  placeholder <- data.frame(
    "差" = c(1), "中" = c(1), "好" = c(1), "c_bianhao" = c(1), "date" = c(1)
  )

  per_shop <- lapply(shop_ids, function(shop_id) {
    shop_rows <- reviews[which(reviews$c_bianhao == shop_id), , drop = FALSE]

    per_day <- lapply(sort(unique(shop_rows$date)), function(day) {
      day_rows <- shop_rows[which(shop_rows$date == day), , drop = FALSE]

      # sum() over an empty selection is 0, which is the filler value used
      # for a class that does not occur on this day.
      totals <- vapply(
        class_labels,
        function(label) {
          sum(day_rows[[score_col]][which(day_rows[[class_col]] == label)])
        },
        numeric(1),
        USE.NAMES = FALSE
      )

      data.frame(
        "差" = c(class_labels[1], totals[1]),
        "中" = c(class_labels[2], totals[2]),
        "好" = c(class_labels[3], totals[3]),
        "c_bianhao" = c(shop_id, shop_id),
        "date" = c(day, day),
        stringsAsFactors = FALSE
      )
    })

    do.call(rbind, per_day)
  })

  # Bind everything once instead of growing the table inside the loops.
  do.call(rbind, c(list(placeholder), per_shop))
}

# Save the classified, de-noised table and read it back.
write.table(
  sentiment, "sentiment.csv",
  sep = ",", col.names = TRUE, row.names = FALSE
)
sentiment <- read.csv("sentiment.csv", na.strings = "", stringsAsFactors = FALSE)

# Sorted unique shop identifiers. They are taken directly from the column;
# the scratch files x10.csv and date1.csv are no longer written here.
c_bianhao1 <- sort(unique(sentiment$c_bianhao))
head(sentiment)

# Totals based on sentiment_value1 (critic1 / class1).
zonghe <- summarise_daily_classes(sentiment, c_bianhao1, "critic1", "class1")
write.table(zonghe, "综合.csv", sep = ",", col.names = TRUE, row.names = FALSE)

# Totals based on sentiment_value2 (critic2 / class2).
zonghe <- summarise_daily_classes(sentiment, c_bianhao1, "critic2", "class2")
write.table(zonghe, "综合1.csv", sep = ",", col.names = TRUE, row.names = FALSE)

# ---- Step 4: drop the repeated label rows ----
# Read the combined totals table.
sentiment_1 <- read.csv("综合.csv", na.strings = "", stringsAsFactors = FALSE)
str(sentiment_1)
head(sentiment_1)
# Untouched copy of the table as it was read.
sentiment_2 <- sentiment_1

# The label rows are the ones whose "bad rating" column holds the text
# "差评". The column is called 差1 in this step, but the export step names
# it 差, so accept either.
# NOTE(review): this looks like 综合.csv is merged with 综合1.csv and its
# columns renamed by hand before this step runs - confirm.
label_col <- intersect(c("差1", "差"), names(sentiment_1))
if (length(label_col) == 0) {
  stop("综合.csv has neither a 差1 nor a 差 column", call. = FALSE)
}

# Remove all label rows in one pass. Deleting rows inside an index loop
# skips the row that follows each deletion, and the old `else` branch tried
# to assign every other row into a single row.
is_label_row <- sentiment_1[[label_col[1]]] %in% "差评"
sentiment_1 <- sentiment_1[!is_label_row, , drop = FALSE]

write.table(
  sentiment_1, "sentiment_1.csv",
  sep = ",", col.names = TRUE, row.names = FALSE
)
dim(sentiment_1)
# ---- Step 5: label every date with the last date of its 3-date window ----
# Example: 7 April, 8 April and 9 April all receive the label "9 April".
sentiment <- read.csv("C:/Users/11565/Desktop/合并二/可以用数据/sentiment_1.csv",na.strings="",stringsAsFactors = F)

# Export the distinct dates. The exported list is not in chronological
# order; date.csv has to be sorted from earliest to latest before it is
# read back below.
date_counts <- as.data.frame(table(sentiment$date))
write.table(date_counts, "date.csv", col.names = TRUE, row.names = FALSE, sep = ",")
x12 <- read.csv("C:/Users/11565/Desktop/合并二/可以用数据/date.csv",na.strings="",stringsAsFactors = F)
date_levels <- x12[, 1]

# Position of each review date in the date list; positions 1-3 form the
# first window, 4-6 the second, and so on. The label is the date at the
# window's last position.
date_pos <- match(sentiment$date, date_levels)
# When the number of dates is not a multiple of three the final window is
# shorter; clamp to the last available date instead of indexing past the
# end of the list, which produced NA labels.
window_end <- pmin(ceiling(date_pos / 3) * 3, length(date_levels))
sentiment$biaoqian <- date_levels[window_end]

write.table(sentiment, "sentiment_2.csv", col.names = TRUE, row.names = FALSE, sep = ",")

# ---- Step 6: add up the totals inside every (shop, window label) group ----
sentiment <- read.csv("C:/Users/11565/Desktop/合并二/可以用数据/sentiment_2.csv",na.strings="",stringsAsFactors = F)

# Sorted unique shop identifiers, taken directly from the column; the
# scratch files x10.csv and 标签1.csv are no longer written here.
c_bianhao1 <- sort(unique(sentiment$c_bianhao))

# One single-row data frame per (shop, label) pair, bound together once at
# the end instead of growing the result inside the loop.
group_sums <- list()
for (shop_id in c_bianhao1) {
  shop_rows <- sentiment[which(sentiment$c_bianhao == shop_id), , drop = FALSE]

  # Labels are processed in order of first appearance for this shop.
  for (label in unique(shop_rows$biaoqian)) {
    # A missing label matches no rows and would make aggregate() fail.
    if (is.na(label)) {
      next
    }
    label_rows <- shop_rows[which(shop_rows$biaoqian == label), , drop = FALSE]
    # NOTE(review): assumes columns 3-8 hold the six rating totals - confirm
    # against the layout of sentiment_2.csv.
    summed <- aggregate(
      label_rows[3:8],
      by = list(date = label_rows$biaoqian),
      sum
    )
    summed$c_bianhao <- shop_id
    group_sums[[length(group_sums) + 1]] <- summed
  }
}

zonghe1 <- if (length(group_sums) > 0) {
  do.call(rbind, group_sums)
} else {
  data.frame()
}
write.table(zonghe1, "sentiment_3.csv", col.names = TRUE, row.names = FALSE, sep = ",")
# sentiment_3.csv now holds the summed rating totals per shop and label.

# ---- Step 7: running totals per shop ----
sentiment<-read.csv("C:/Users/11565/Desktop/合并二/可以用数据/sentiment_3.csv",na.strings="",stringsAsFactors = F)
# Unique shop identifiers in order of first appearance. They are taken
# straight from the column; writing them to x10.csv and reading them back
# added nothing but a dependency on the working directory.
c_bianhao1 <- unique(sentiment$c_bianhao)
# Running (cumulative) total of a vector.
#
# Args:
#   x: numeric vector.
#
# Returns a vector of the same length whose i-th element is the sum of
# x[1] to x[i]. Built on cumsum(), so vectors of length 0 or 1 are handled
# too; the hand-written `2:length(x)` loop counted backwards for those and
# failed.
fd <- function(x) {
  cumsum(x)
}
# Replace the per-window totals of every shop by their running totals.
#
# Each shop is handled exactly once. Previously the `else` had no braces,
# so only the first statement belonged to it: a shop with a single row was
# appended as-is and then appended again from the stale `x111` left over
# from the previous shop.
# NOTE(review): rows are accumulated in file order; confirm sentiment_3.csv
# is in chronological order within each shop.
shop_totals <- vector("list", length(c_bianhao1))
for (k in seq_along(c_bianhao1)) {
  x11 <- sentiment[which(sentiment$c_bianhao == c_bianhao1[k]), , drop = FALSE]

  # NOTE(review): assumes columns 2-7 hold the six rating totals - confirm
  # against the layout of sentiment_3.csv.
  running <- x11[, 2:7, drop = FALSE]
  # A single row is already its own running total.
  if (nrow(running) > 1) {
    running[] <- lapply(running, fd)
  }
  running$date <- x11$date
  running$c_bianhao <- x11$c_bianhao
  shop_totals[[k]] <- running
}

zonghe1 <- if (length(shop_totals) > 0) {
  do.call(rbind, shop_totals)
} else {
  data.frame()
}
write.table(zonghe1, "sentiment_4.csv", col.names = TRUE, row.names = FALSE, sep = ",")
# sentiment_4.csv is the final table this script produces.


 

  • 1
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值