赛题说明链接
http://download.csdn.net/detail/q383700092/9538252
R语言 1379640 918539 2021961 1365166 5个 。两个双11.两个双12 一个6.18(年中大促)
统计全国仓库预测的前两周 商品_仓库_个数
rm(list=ls())
# National warehouse: for every item (V1), total V2 over the rows whose V3
# (a yyyymmdd value) lies in 20151206..20151220 inclusive, then write
# "item,all,total" rows to result01_01.csv.
w <- read.table("F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/item_feature01.csv", header = FALSE, sep = "\t")
e <- unique(w$V1)
# Compare the dates as numbers (the original compared a number with a string)
# and combine both bounds with `&`; rows with a missing date are excluded.
in_window <- !is.na(w$V3) & w$V3 >= 20151206 & w$V3 <= 20151220
# Splitting on a factor with fixed levels keeps items that have no rows in the
# window, so they still get a total of 0, in the same order as `e`.
item <- factor(w$V1[in_window], levels = e)
x <- vapply(split(w$V2[in_window], item), sum, numeric(1), USE.NAMES = FALSE)
# Constant warehouse label for the national total.
y <- rep("all", length(e))
da <- data.frame(e, y, x)
write.table(da, file = "F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/result01_01.csv", sep = ",", row.names = FALSE, col.names = FALSE, quote = FALSE)
统计区域仓库预测的前两周 商品_仓库_个数
rm(list = ls())
# Regional warehouses: total V3 per (item V1, warehouse V2) over the rows whose
# V4 (a yyyymmdd value) lies in 20151128..20151210 inclusive, written to
# result02_02.csv as "item,warehouse,total".
w <- read.table("F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/item_store_feature01.csv", header = FALSE, sep = "\t")
e <- unique(w$V1)
in_window <- !is.na(w$V4) & w$V4 >= 20151128 & w$V4 <= 20151210
w2 <- w[in_window, ]
if (nrow(w2) > 0) {
  # One aggregate over the filtered rows. The original aggregated item by item
  # inside a loop, which stops with "no rows to aggregate" as soon as one item
  # has no rows in the window, and grew the result with rbind().
  re <- aggregate(w2$V3, list(w2$V1, w2$V2), sum)
  # Keep the original row order: items by first appearance, then warehouse.
  re <- re[order(match(re$Group.1, e), re$Group.2), ]
} else {
  re <- data.frame(Group.1 = numeric(0), Group.2 = numeric(0), x = numeric(0))
}
write.table(re, file = "F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/result02_02.csv", sep = ",", row.names = FALSE, col.names = FALSE, quote = FALSE)
方法二 不用聚合
rm(list = ls())
# Same regional total as the aggregate() version, computed without aggregate():
# total V3 per (item V1, warehouse V2) for V4 in 20151128..20151210, written to
# result02_02.csv as "item,warehouse,total". Pairs with no rows in the window
# are kept with a total of 0.
w <- read.table("F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/item_store_feature01.csv", header = FALSE, sep = "\t")
# Distinct (item, warehouse) pairs in order of first appearance.
tmp <- data.frame(w$V1, w$V2)
e1 <- unique(tmp)
# Group on a key with a separator. The original pasted the two ids together
# with no separator (and called str_c() before stringr was loaded), so e.g.
# (12, 3) and (1, 23) collapsed into the same key 123.
pair_key <- paste(w$V1, w$V2, sep = "_")
pair <- factor(pair_key, levels = unique(pair_key))
in_window <- !is.na(w$V4) & w$V4 >= 20151128 & w$V4 <= 20151210
x <- vapply(split(w$V3[in_window], pair[in_window]), sum, numeric(1), USE.NAMES = FALSE)
da2 <- data.frame(e1, x)
write.table(da2, file = "F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/result02_02.csv", sep = ",", row.names = FALSE, col.names = FALSE, quote = FALSE)
查看未统计的预测的前两周 商品_仓库_个数
rm(list = ls())
# Count how many rows of result04.csv share each (V1, V2) pair, keep the pairs
# that occur exactly once, reset their count column to 0 and write them to
# result05.csv.
w <- read.table("F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/result04.csv", header = FALSE, sep = ",")
# A column of ones: summing it per group gives the group size.
V4 <- rep(1, length(w$V1))
d <- data.frame(w, V4)
xx <- aggregate(x = d$V4, by = list(d$V1, d$V2), FUN = sum)
once <- which(xx$x == 1)
y <- xx[once, ]
y$x <- 0
write.table(y, file = "F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/result05.csv", sep = ",", row.names = FALSE, col.names = FALSE, quote = FALSE)
每一个唯一e(i)的数据框
# Rows of `w` whose V1 equals the first unique id in `e`.
first_id_rows <- which(w$V1 == e[1])
w1 <- w[first_id_rows, ]
数据框满足多个条件拆开写
# Two-step filter of `w1` on V2: first the upper bound, then the lower bound.
not_after_end <- which(w1$V2 <= "20151227")
t1 <- w1[not_after_end, ]
t2 <- t1[t1$V2 >= "20151214", ]
求满足条件的V3列的和(条件可以拆开写,也可以用 & 合并成一个逻辑条件,例如 w1[which(w1$V2 >= "20151214" & w1$V2 <= "20151227"),])
# Sum V3 over the rows of `w1` whose V2 lies between "20151214" and
# "20151227" (both inclusive), filtering one bound at a time.
not_after_end <- which(w1$V2 <= "20151227")
t1 <- w1[not_after_end, ]
t2 <- t1[t1$V2 >= "20151214", ]
sum(t2$V3)
# Small example frame: group label `a` (three 1s, three 2s) and value `b`.
a <- rep(c(1, 2), each = 3)
b <- c(2, 3, 4, 3, 4, 2)
d <- data.frame(a = a, b = b)
1,2
1,3
1,4
2,3
2,4
2,2
# Sum d$b within each distinct value of d$a, in order of first appearance.
e <- unique(d$a)
# Build `x` in one step: the original indexed into `x` without creating it in
# this snippet and hard-coded the number of groups to 2.
x <- vapply(e, function(id) sum(d$b[which(d$a == id)]), numeric(1))
# Positions of the rows whose b is at most 3.
which(d$b <= 3)
分1组d$a求d$b和
# Sum of b for each value of a.
with(d, tapply(b, a, sum))
分两组d$a,d$b求d$b和
# Sum of b for each (a, b) combination; the result has columns Group.1, Group.2, x.
aggregate(x = d$b, by = list(d$a, d$b), FUN = sum)
R语言中的横向数据合并merge及纵向数据合并rbind的使用
我们经常会遇到两个数据框拥有相同的时间或观测值,但这些列却不尽相同。处理的办法就是使用
merge(x, y ,by.x = ,by.y = ,all = ) 函数。
# Merge: join two frames on their shared ID column.
ID <- as.numeric(1:4)
name <- LETTERS[1:4]
score <- c(60, 70, 80, 90)
student1 <- data.frame(ID, name)
student2 <- data.frame(ID, score)
total_student1 <- merge(x = student1, y = student2, by = "ID")
print(total_student1)
# Column-wise append: glue the columns of two frames side by side.
ID <- as.numeric(1:4)
name <- LETTERS[1:4]
score <- c(60, 70, 80, 90)
sex <- c("M", "F", "M", "M")
student1 <- data.frame(ID, name)
student2 <- data.frame(score, sex)
total_student2 <- cbind(student1, student2)
print(total_student2)
# Row-wise append: stack two frames that have the same columns.
ID <- as.numeric(1:4)
name <- LETTERS[1:4]
student1 <- data.frame(ID, name)
ID <- as.numeric(5:8)
name <- LETTERS[5:8]
student2 <- data.frame(ID, name)
total_student3 <- rbind(student1, student2)
print(total_student3)
R预测过程
rm(list = ls())
# Plot column V3 of item 101615 from item_feature01.csv against row order.
w <- read.table("F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/item_feature01.csv", header = FALSE, sep = "\t")
e <- unique(w$V1)
# The original line was a garbled paste of this filter nested inside itself
# and did not parse.
d <- w[which(w$V1 == 101615), ]
x <- seq_along(d$V3)
# NOTE(review): other snippets treat V3 of this file as the date and V2 as the
# quantity; confirm that V3 is really the column meant to be plotted here.
y <- d$V3
plot(x, y, "b")
取预测值的0.8
rm(list = ls())
# Scale column V3 of result06.csv to 80% and write "V1,V2,scaled" rows to
# result06_1.csv.
w <- read.table("F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/result06.csv", header = FALSE, sep = ",")
head(w)
# floor() always rounds down (floor(1.6) is 1); ceiling(1.6) rounds up to 2.
# Neither rounds to the nearest integer.
q <- floor(0.8 * w$V3)
d <- data.frame(w$V1, w$V2, q)
write.table(d, file = "F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/result06_1.csv", sep = ",", row.names = FALSE, col.names = FALSE, quote = FALSE)
全国总仓库生成14天间隔的序列
rm(list = ls())
# National warehouse: one row per item holding the totals of V2 over
# consecutive 14-day windows covering 2014-10-06 .. 2015-12-27, written to
# time01.csv as "item,all,total_1,...,total_32".
first_day <- as.Date("2014-10-06")
last_day <- as.Date("2015-12-27")
window_days <- 14L
# Number of complete windows in the range (448 days / 14 = 32).
n_windows <- (as.integer(last_day - first_day) + 1L) %/% window_days

w <- read.table("F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/item_feature01.csv", header = FALSE, sep = "\t")
e <- unique(w$V1)
# V3 is a yyyymmdd value; turn it into a window index (1 = first 14 days).
obs_day <- as.Date(as.character(w$V3), format = "%Y%m%d")
window_id <- as.integer(obs_day - first_day) %/% window_days + 1L
# One grouped sum replaces the item-by-window double loop. The factor levels
# fix the row order (items by first appearance) and the column order; dates
# outside the range fall outside the levels and are ignored.
x <- tapply(
  w$V2,
  list(factor(w$V1, levels = e), factor(window_id, levels = seq_len(n_windows))),
  sum
)
# Windows without data, and sums that came out NA, count as 0.
x[is.na(x)] <- 0
x <- matrix(as.numeric(x), nrow = nrow(x))
# Constant warehouse label for the national series.
y <- rep("all", length(e))
da <- data.frame(e, y, x)
write.table(da, file = "F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/time01.csv", sep = ",", row.names = FALSE, col.names = FALSE, quote = FALSE)
分仓库生成14天间隔的序列
rm(list = ls())
# Regional warehouses: one row per (item V1, warehouse V2) pair holding the
# totals of V3 over consecutive 14-day windows covering 2014-10-06 ..
# 2015-12-27, written to time02.csv as
# "item,warehouse,key,total_1,...,total_32".
first_day <- as.Date("2014-10-06")
last_day <- as.Date("2015-12-27")
window_days <- 14L
# Number of complete windows in the range (448 days / 14 = 32).
n_windows <- (as.integer(last_day - first_day) + 1L) %/% window_days

w <- read.table("F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/item_store_feature01.csv", header = FALSE, sep = "\t")
# Distinct (item, warehouse) pairs in order of first appearance.
tmp <- data.frame(w$V1, w$V2)
e1 <- unique(tmp)
# Legacy numeric key (item id and warehouse id pasted together). It is still
# written as the third output column so the file layout is unchanged, but it
# is no longer used for grouping: without a separator, (12, 3) and (1, 23)
# would share the key 123.
e <- as.numeric(paste0(e1$w.V1, e1$w.V2))
pair_key <- paste(w$V1, w$V2, sep = "_")
pair <- factor(pair_key, levels = unique(pair_key))
# V4 is a yyyymmdd value; turn it into a window index (1 = first 14 days).
obs_day <- as.Date(as.character(w$V4), format = "%Y%m%d")
window_id <- as.integer(obs_day - first_day) %/% window_days + 1L
# One grouped sum replaces the pair-by-window double loop; dates outside the
# range fall outside the factor levels and are ignored.
x <- tapply(w$V3, list(pair, factor(window_id, levels = seq_len(n_windows))), sum)
# Windows without data, and sums that came out NA, count as 0.
x[is.na(x)] <- 0
x <- matrix(as.numeric(x), nrow = nrow(x))
da <- data.frame(e1, e, x)
write.table(da, file = "F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/time02.csv", sep = ",", row.names = FALSE, col.names = FALSE, quote = FALSE)
全国预测
rm(list = ls())
# Forecast the next 14-day total of every item from the 32 historical windows
# in time01.csv (columns 3..34) with auto.arima, clip negative forecasts to 0
# and write "item,all,forecast" rows to result08_01.csv.
w <- read.table("F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/time01.csv", header = FALSE, sep = ",")
library("forecast")
history_cols <- 3:34
# Convert only the history columns; converting the whole row would also try to
# coerce the label in column 2. vapply() replaces growing `d` inside a
# `1:length()` loop.
d <- vapply(
  seq_len(nrow(w)),
  function(i) {
    y <- as.numeric(w[i, history_cols])
    as.numeric(floor(forecast(auto.arima(y), 1)$mean))
  },
  numeric(1)
)
d[d < 0] <- 0
da <- data.frame(w$V1, w$V2, d)
write.table(da, file = "F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/result08_01.csv", sep = ",", row.names = FALSE, col.names = FALSE, quote = FALSE)
查看单独模型
# Plot the one-step auto.arima forecast for the series in row 6 of `w`
# (columns 3..34).
a <- w[6, ]
y <- as.numeric(a)[3:34]
fit <- auto.arima(y)
plot(forecast(fit, 1))
分仓预测
rm(list = ls())
# Forecast the next 14-day total of every (item, warehouse) row from the 32
# historical windows in time02.csv (columns 4..35) with auto.arima, clip
# negative forecasts to 0 and write "item,warehouse,forecast" rows to
# result08_02.csv.
w <- read.table("F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/time02.csv", header = FALSE, sep = ",")
library("forecast")
history_cols <- 4:35
# vapply() over the rows replaces growing `d` inside a `1:length()` loop; only
# the history columns are converted to numeric.
d <- vapply(
  seq_len(nrow(w)),
  function(i) {
    y <- as.numeric(w[i, history_cols])
    as.numeric(floor(forecast(auto.arima(y), 1)$mean))
  },
  numeric(1)
)
d[d < 0] <- 0
da <- data.frame(w$V1, w$V2, d)
write.table(da, file = "F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/result08_02.csv", sep = ",", row.names = FALSE, col.names = FALSE, quote = FALSE)
查看单独模型
# Plot the one-step auto.arima forecast for the series in row 6 of `w`
# (columns 4..35).
a <- w[6, ]
y <- as.numeric(a)[4:35]
fit <- auto.arima(y)
plot(forecast(fit, 1))
分成7天一个周期
全国总仓库生成7天间隔的序列
rm(list = ls())
# National warehouse: one row per item holding the totals of V2 over
# consecutive 7-day windows covering 2014-10-06 .. 2015-12-27, written to
# time7_01.csv as "item,all,total_1,...,total_64".
first_day <- as.Date("2014-10-06")
last_day <- as.Date("2015-12-27")
window_days <- 7L
# Number of complete windows in the range (448 days / 7 = 64).
n_windows <- (as.integer(last_day - first_day) + 1L) %/% window_days

w <- read.table("F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/item_feature01.csv", header = FALSE, sep = "\t")
e <- unique(w$V1)
# V3 is a yyyymmdd value; turn it into a window index (1 = first 7 days).
obs_day <- as.Date(as.character(w$V3), format = "%Y%m%d")
window_id <- as.integer(obs_day - first_day) %/% window_days + 1L
# One grouped sum replaces the item-by-window double loop. The factor levels
# fix the row order (items by first appearance) and the column order; dates
# outside the range fall outside the levels and are ignored.
x <- tapply(
  w$V2,
  list(factor(w$V1, levels = e), factor(window_id, levels = seq_len(n_windows))),
  sum
)
# Windows without data, and sums that came out NA, count as 0.
x[is.na(x)] <- 0
x <- matrix(as.numeric(x), nrow = nrow(x))
# Constant warehouse label for the national series.
y <- rep("all", length(e))
da <- data.frame(e, y, x)
write.table(da, file = "F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/time7_01.csv", sep = ",", row.names = FALSE, col.names = FALSE, quote = FALSE)
分仓库生成7天间隔的序列
rm(list = ls())
# Regional warehouses: one row per (item V1, warehouse V2) pair holding the
# totals of V3 over consecutive 7-day windows covering 2014-10-06 ..
# 2015-12-27, written to time7_02.csv as
# "item,warehouse,key,total_1,...,total_64".
first_day <- as.Date("2014-10-06")
last_day <- as.Date("2015-12-27")
window_days <- 7L
# Number of complete windows in the range (448 days / 7 = 64).
n_windows <- (as.integer(last_day - first_day) + 1L) %/% window_days

w <- read.table("F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/item_store_feature01.csv", header = FALSE, sep = "\t")
# Distinct (item, warehouse) pairs in order of first appearance.
tmp <- data.frame(w$V1, w$V2)
e1 <- unique(tmp)
# Legacy numeric key (item id and warehouse id pasted together). It is still
# written as the third output column so the file layout is unchanged, but it
# is no longer used for grouping: without a separator, (12, 3) and (1, 23)
# would share the key 123.
e <- as.numeric(paste0(e1$w.V1, e1$w.V2))
pair_key <- paste(w$V1, w$V2, sep = "_")
pair <- factor(pair_key, levels = unique(pair_key))
# V4 is a yyyymmdd value; turn it into a window index (1 = first 7 days).
obs_day <- as.Date(as.character(w$V4), format = "%Y%m%d")
window_id <- as.integer(obs_day - first_day) %/% window_days + 1L
# One grouped sum replaces the pair-by-window double loop; dates outside the
# range fall outside the factor levels and are ignored.
x <- tapply(w$V3, list(pair, factor(window_id, levels = seq_len(n_windows))), sum)
# Windows without data, and sums that came out NA, count as 0.
x[is.na(x)] <- 0
x <- matrix(as.numeric(x), nrow = nrow(x))
da <- data.frame(e1, e, x)
write.table(da, file = "F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/time7_02.csv", sep = ",", row.names = FALSE, col.names = FALSE, quote = FALSE)
全国预测
rm(list = ls())
# Forecast the next two weeks of every item from the 64 weekly totals in
# time7_01.csv (columns 3..66) with auto.arima, add the two floored weekly
# forecasts, clip negative sums to 0 and write "item,all,forecast" rows to
# result08_07_1.csv.
w <- read.table("F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/time7_01.csv", header = FALSE, sep = ",")
library("forecast")
history_cols <- 3:66
# vapply() over the rows replaces growing `d` inside a `1:length()` loop; only
# the history columns are converted, not the label in column 2.
d <- vapply(
  seq_len(nrow(w)),
  function(i) {
    y <- as.numeric(w[i, history_cols])
    sum(floor(forecast(auto.arima(y), 2)$mean))
  },
  numeric(1)
)
d[d < 0] <- 0
da <- data.frame(w$V1, w$V2, d)
write.table(da, file = "F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/result08_07_1.csv", sep = ",", row.names = FALSE, col.names = FALSE, quote = FALSE)
查看单独模型
# Plot the two-step auto.arima forecast for the series in row 6 of `w`
# (columns 3..66).
a <- w[6, ]
y <- as.numeric(a)[3:66]
fit <- auto.arima(y)
plot(forecast(fit, 2))
分仓预测
rm(list = ls())
# Forecast the next two weeks of every (item, warehouse) row from the 64 weekly
# totals in time7_02.csv (columns 4..67) with auto.arima, add the two floored
# weekly forecasts, clip negative sums to 0 and write
# "item,warehouse,forecast" rows to result08_07_2.csv.
w <- read.table("F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/time7_02.csv", header = FALSE, sep = ",")
library("forecast")
history_cols <- 4:67
# vapply() over the rows replaces growing `d` inside a `1:length()` loop.
d <- vapply(
  seq_len(nrow(w)),
  function(i) {
    y <- as.numeric(w[i, history_cols])
    # floor() rounds down; ceiling() would round up instead.
    sum(floor(forecast(auto.arima(y), 2)$mean))
  },
  numeric(1)
)
d[d < 0] <- 0
da <- data.frame(w$V1, w$V2, d)
write.table(da, file = "F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/result08_07_2.csv", sep = ",", row.names = FALSE, col.names = FALSE, quote = FALSE)
查看单独模型
# Plot the two-step auto.arima forecast for the series in row 6 of `w`
# (columns 4..67).
a <- w[6, ]
y <- as.numeric(a)[4:67]
fit <- auto.arima(y)
plot(forecast(fit, 2))
建立评估模型
预测最后一个,平方和最小
rm(list = ls())
# Hold-out evaluation on time7_01.csv: fit auto.arima on columns 3..64,
# forecast two steps, and compare the summed forecast with the held-out total
# V65 + V66.
w <- read.table("F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/time7_01.csv", header = FALSE, sep = ",")
library("forecast")
train_cols <- 3:64
# vapply() over the rows replaces growing `d` inside a `1:length()` loop; only
# the training columns are converted to numeric.
d <- vapply(
  seq_len(nrow(w)),
  function(i) {
    y <- as.numeric(w[i, train_cols])
    sum(floor(forecast(auto.arima(y), 2)$mean))
  },
  numeric(1)
)
d[d < 0] <- 0
t <- w$V65 + w$V66
# Root mean squared error; values recorded in the original notes: 213.5006 102.3938
sqrt(mean((d - t)^2))
# Relative absolute error per row (Inf/NaN where the held-out total is 0).
round(abs(d - t) / t, 2)
改动1---ceiling向上取整(无明显改动)
# Variant 1: same hold-out evaluation on `w`, rounding each weekly auto.arima
# forecast up with ceiling() instead of down.
train_cols <- 3:64
# vapply() over the rows replaces growing `d` inside a `1:length()` loop.
d <- vapply(
  seq_len(nrow(w)),
  function(i) {
    y <- as.numeric(w[i, train_cols])
    sum(ceiling(forecast(auto.arima(y), 2)$mean))
  },
  numeric(1)
)
d[d < 0] <- 0
t <- w$V65 + w$V66
# RMSE; values recorded in the original notes: 213.8533 102.3783
sqrt(mean((d - t)^2))
改动2--ets模型 (比较明显降低)
# Variant 2: same hold-out evaluation on `w` with an ets() model.
train_cols <- 3:64
# vapply() over the rows replaces growing `d` inside a `1:length()` loop.
d <- vapply(
  seq_len(nrow(w)),
  function(i) {
    y <- as.numeric(w[i, train_cols])
    sum(floor(predict(ets(y), 2)$mean))
  },
  numeric(1)
)
d[d < 0] <- 0
t <- w$V65 + w$V66
# RMSE; values recorded in the original notes: 163.6495 86.44429
sqrt(mean((d - t)^2))
改动3--均值预测meanf (比较明显降低)
# Variant 3: same hold-out evaluation on `w` with the mean forecast meanf().
train_cols <- 3:64
# vapply() over the rows replaces growing `d` inside a `1:length()` loop.
d <- vapply(
  seq_len(nrow(w)),
  function(i) {
    y <- as.numeric(w[i, train_cols])
    sum(floor(meanf(y, h = 2)$mean))
  },
  numeric(1)
)
d[d < 0] <- 0
t <- w$V65 + w$V66
# RMSE; values recorded in the original notes: 131.1134 136.9257
sqrt(mean((d - t)^2))
改动4--naive预测 (效果较差) ARIMA(0,1,0)
# Variant 4: same hold-out evaluation on `w` with the naive() forecast.
train_cols <- 3:64
# vapply() over the rows replaces growing `d` inside a `1:length()` loop.
d <- vapply(
  seq_len(nrow(w)),
  function(i) {
    y <- as.numeric(w[i, train_cols])
    sum(floor(naive(y, h = 2)$mean))
  },
  numeric(1)
)
d[d < 0] <- 0
t <- w$V65 + w$V66
# RMSE; values recorded in the original notes: 253.7406 100.6428
sqrt(mean((d - t)^2))
改动5--随机游走预测rwf (效果较差)
# Variant 5: same hold-out evaluation on `w` with a random walk with drift.
train_cols <- 3:64
# vapply() over the rows replaces growing `d` inside a `1:length()` loop.
d <- vapply(
  seq_len(nrow(w)),
  function(i) {
    y <- as.numeric(w[i, train_cols])
    sum(floor(rwf(y, h = 2, drift = TRUE)$mean))
  },
  numeric(1)
)
d[d < 0] <- 0
t <- w$V65 + w$V66
# RMSE; values recorded in the original notes: 262.56 102.3345
sqrt(mean((d - t)^2))
改动6--指数平滑预测ses(效果一般)
# Variant 6: same hold-out evaluation on `w` with simple exponential smoothing
# (alpha fixed at 0.9).
train_cols <- 3:64
# vapply() over the rows replaces growing `d` inside a `1:length()` loop.
d <- vapply(
  seq_len(nrow(w)),
  function(i) {
    y <- as.numeric(w[i, train_cols])
    sum(floor(ses(y, h = 2, initial = "simple", alpha = 0.9)$mean))
  },
  numeric(1)
)
d[d < 0] <- 0
t <- w$V65 + w$V66
# RMSE; values recorded in the original notes: 229.4119 92.77852
sqrt(mean((d - t)^2))
改动7--霍尔特 - 温特斯过滤(效果一般)
# Variant 7: same hold-out evaluation on `w` with holt() (no damping, beta
# fixed at 0.1).
train_cols <- 3:64
# vapply() over the rows replaces growing `d` inside a `1:length()` loop.
d <- vapply(
  seq_len(nrow(w)),
  function(i) {
    y <- as.numeric(w[i, train_cols])
    sum(floor(holt(y, h = 2, damped = FALSE, initial = "simple", beta = 0.1)$mean))
  },
  numeric(1)
)
d[d < 0] <- 0
t <- w$V65 + w$V66
# RMSE; values recorded in the original notes: 179.5424 89.7644
sqrt(mean((d - t)^2))
改动8--前两周预测
# Variant 8: use the total of the two weeks just before the hold-out
# (V63 + V64) as the forecast for V65 + V66.
t <- w$V65 + w$V66
d <- w$V63 + w$V64
# RMSE; value recorded in the original notes: 77.96011
sqrt(mean((d - t)^2))
改动9--减弱双11双12影响(0.5*) (效果明显)
# Look at the training part of the first row before reloading the data.
as.numeric(w[1, ])[3:64]
rm(list = ls())
# Variant 9: hold-out evaluation on time7_01.csv after halving the two
# promotion weeks in the training range, then auto.arima as in the baseline.
w <- read.table("F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/time7_01.csv", header = FALSE, sep = ",")
w$V64 <- floor(0.5 * w$V64) # Double 12 week
w$V60 <- floor(0.5 * w$V60) # Double 11 week; the notes also tried a factor of 0.25
library("forecast")
train_cols <- 3:64
# vapply() over the rows replaces growing `d` inside a `1:length()` loop.
d <- vapply(
  seq_len(nrow(w)),
  function(i) {
    y <- as.numeric(w[i, train_cols])
    sum(floor(forecast(auto.arima(y), 2)$mean))
  },
  numeric(1)
)
d[d < 0] <- 0
t <- w$V65 + w$V66
# RMSE recorded in the original notes: 104.6499; with the Double 11 factor at
# 0.25: 102.3938
sqrt(mean((d - t)^2))
改动10----线性拟合
rm(list = ls())
# Variant 10: hold-out evaluation on time7_01.csv with a quadratic trend fitted
# per row to columns 3..64 and extrapolated two steps ahead.
w <- read.table("F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/time7_01.csv", header = FALSE, sep = ",")
library("forecast")
train_cols <- 3:64
d <- vapply(
  seq_len(nrow(w)),
  function(i) {
    y <- as.numeric(w[i, train_cols])
    x <- seq_along(y)
    # Alternative tried in the notes: a straight line, lm(y ~ x + 1).
    r <- lm(y ~ x + I(x^2))
    # Predict the two steps right after the training range. The original used
    # x = 65, 66, which are the file's column numbers, not time indices: the
    # training series is indexed 1..62, so the held-out steps are 63 and 64.
    z <- data.frame(x = length(y) + 1:2)
    sum(floor(predict(r, z)))
  },
  numeric(1)
)
d[d < 0] <- 0
t <- w$V65 + w$V66
# RMSE values recorded in the original notes (computed with x = 65, 66):
# 140.8061 128.978 125.378
sqrt(mean((d - t)^2))
使用改进-减弱双11双12影响(0.5* 0.25*)
rm(list = ls())
# National forecast with damped promotion weeks: scale V64 by 0.5 and V60 by
# 0.25 in time7_01.csv, forecast two weeks per item with auto.arima on columns
# 3..66, clip negative sums to 0 and write "item,all,forecast" rows to
# result09_07_1.csv.
w <- read.table("F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/time7_01.csv", header = FALSE, sep = ",")
w$V64 <- floor(0.5 * w$V64) # Double 12 week
w$V60 <- floor(0.25 * w$V60) # Double 11 week
library("forecast")
history_cols <- 3:66
# vapply() over the rows replaces growing `d` inside a `1:length()` loop; only
# the history columns are converted, not the label in column 2.
d <- vapply(
  seq_len(nrow(w)),
  function(i) {
    y <- as.numeric(w[i, history_cols])
    sum(floor(forecast(auto.arima(y), 2)$mean))
  },
  numeric(1)
)
d[d < 0] <- 0
da <- data.frame(w$V1, w$V2, d)
write.table(da, file = "F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/result09_07_1.csv", sep = ",", row.names = FALSE, col.names = FALSE, quote = FALSE)
rm(list = ls())
# Regional forecast with damped promotion weeks: forecast two weeks per
# (item, warehouse) row of time7_02.csv with auto.arima on columns 4..67, clip
# negative sums to 0 and write "item,warehouse,forecast" rows to
# result09_07_2.csv.
w <- read.table("F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/time7_02.csv", header = FALSE, sep = ",")
# time7_02.csv has three leading columns (item, warehouse, key), one more than
# time7_01.csv, so every weekly column sits one position further right. The
# promotion weeks that are V64 / V60 in the national file are V65 / V61 here;
# the original damped V64 / V60, i.e. the weeks before the promotions.
w$V65 <- floor(0.5 * w$V65) # Double 12 week
w$V61 <- floor(0.25 * w$V61) # Double 11 week
library("forecast")
history_cols <- 4:67
# vapply() over the rows replaces growing `d` inside a `1:length()` loop.
d <- vapply(
  seq_len(nrow(w)),
  function(i) {
    y <- as.numeric(w[i, history_cols])
    # floor() rounds down; ceiling() would round up instead.
    sum(floor(forecast(auto.arima(y), 2)$mean))
  },
  numeric(1)
)
d[d < 0] <- 0
da <- data.frame(w$V1, w$V2, d)
write.table(da, file = "F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/result09_07_2.csv", sep = ",", row.names = FALSE, col.names = FALSE, quote = FALSE)
提交结果去负数
# Replace negative values in column V3 of `w` with 0.
negative_rows <- which(w$V3 < 0)
w$V3[negative_rows] <- 0
线下评测
ID 仓库 数量 ID 仓库 补少-补多
分解补多补少
全国范围内的成本
c1=补少*max(实际-预测,0)+补多*max(预测-实际,0)
分仓范围内的成本
c2=补少*max(实际-预测,0)+补多*max(预测-实际,0)
总的衡量标准是上面两者的相加:
c=c1+c2
rm(list = ls())
# Offline cost from ping02.csv: for each row, V5 times the positive part of
# (V4 - V3) plus V6 times the positive part of (V3 - V4), summed over rows.
# NOTE(review): presumably V3 is the forecast, V4 the actual value, and V5 / V6
# the per-unit costs of under- and over-stocking - confirm against the file.
x <- read.table("F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/ping02.csv", header = FALSE, sep = ",")
# Vectorised with pmax(); the original indexed the data frame row by row in a
# `1:length()` loop, which is slow and yields NA for an empty file.
t1 <- x$V5 * pmax(x$V4 - x$V3, 0)
t2 <- x$V6 * pmax(x$V3 - x$V4, 0)
c1 <- sum(t1) + sum(t2)
生成评测数据"hdfs://hadoop-master.dragon.org:9000/bs/music/pingfeng/input/"放入pingdt01.csv"hdfs://hadoop-master.dragon.org:9000/bs/music/pingfeng/output"删除 结果重命名ping02.csv
# Build the offline evaluation input: for every row of time7_01.csv, the
# two-step meanf() forecast fitted on columns 3..64 next to the held-out total
# V65 + V66, written to pingdt01.csv as "item,all,forecast,actual".
w <- read.table("F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/time7_01.csv", header = FALSE, sep = ",")
library("forecast")
train_cols <- 3:64
# Forecast per row; vapply() replaces growing `d` inside a `1:length()` loop.
d <- vapply(
  seq_len(nrow(w)),
  function(i) {
    y <- as.numeric(w[i, train_cols])
    sum(floor(meanf(y, h = 2)$mean))
  },
  numeric(1)
)
# Held-out total of the last two weekly columns.
t <- w$V65 + w$V66
da <- data.frame(w$V1, w$V2, d, t)
write.table(da, file = "F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/pingdt01.csv", sep = ",", row.names = FALSE, col.names = FALSE, quote = FALSE)
matlab查看分布情况
clear,clc
% Load the raw feature table and plot columns 30 and 31 for the rows whose
% second column equals 300, ordered by the first column.
load('F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/CAINIAO data p1_20160408/item_feature1.csv')
id=item_feature1(:,2); % second column of the matrix
id2=unique(id); % distinct values
t1=item_feature1(item_feature1(:,2)==300,:); % rows matching the condition (another id tried in the notes: 41655)
t2=sortrows(t1,1); % sort by the first column
% The original computed t2 but plotted the unsorted t1. Plot the sorted rows,
% and keep t1 sorted as well because later snippets plot from t1.
t1=t2;
x=1:size(t2,1);
plot(x,t2(:,30),'r-');
hold on
plot(x,t2(:,31),'b-');
legend('非聚划算件数','测试');
测试两个序列相关性
% Cross-correlation between columns 30 and 7 of t1, then the share of lags
% whose value exceeds 0.6.
% NOTE(review): xcorr is called without a normalisation option, so the values
% are raw sums rather than coefficients in [-1, 1] - confirm that the 0.6
% threshold is meaningful here (the 'coeff' option may have been intended).
xg=xcorr(t1(:,30),t1(:,7));
length(find(xg(:,1)>0.6))/length(xg)
% Overlay column 30 (red) with a list of other feature columns.
% NOTE(review): most overlays use the same 'b-' style and the legend names only
% four series; this looks like a list of alternatives to try one at a time.
plot(x,t1(:,30),'r-');
hold on
plot(x,t1(:,9),'b-'); % cart_ipv: times added to cart
plot(x,t1(:,10),'m-'); % cart_uv: users who added to cart
plot(x,t1(:,12),'g-'); % num_gmv: orders placed
plot(x,t1(:,14),'b-'); % qty_gmv: items ordered
plot(x,t1(:,15),'b-'); % unum_gmv: users who placed orders
plot(x,t1(:,17),'b-'); % num_alipay: orders paid
plot(x,t1(:,18),'b-'); % qty_alipay: items paid; identical for some products (non-Juhuasuan items)
plot(x,t1(:,19),'b-'); % unum_alipay: users who paid
plot(x,t1(:,20),'b-'); % ztc_pv_ipv: views referred by Zhitongche
plot(x,t1(:,24),'b-'); % ztc_pv_uv: visitors referred by Zhitongche
plot(x,t1(:,26),'b-'); % ss_pv_uv: visitors referred by search
plot(x,t1(:,28),'b-'); % num_alipay_njhs: non-Juhuasuan orders paid
plot(x,t1(:,31),'b-'); % unum_alipay_njhs: non-Juhuasuan users who paid
legend('非聚划算件数','被加购次数','加购人次','拍下笔数');
% Distinct values of column 6.
id=item_feature1(:,6);
id2=unique(id);
28个叶子类目ID
11个大类目ID
168个品牌ID
401个供货商
是否有参与聚划算
节日 周末 工作日
周末特征,工作日特征
促销节日,传统节日特征,节前节后特征, 月初月末
前7,14,21天的销量,搜藏量,流量等,价格
判断周几
# Calendar table for 2014-10-06 .. 2015-12-27: `d` is the date as a yyyymmdd
# number and `d1` the weekday number (Monday = 1, ..., Sunday = 7).
d <- seq(as.Date("2014/10/06"), as.Date("2015/12/27"), by = "day")
# "%u" is the ISO weekday number. The original translated the Chinese names
# returned by weekdays(), which only works in a Chinese locale and yields NA
# everywhere else.
d1 <- as.numeric(format(d, "%u"))
# Format the date directly instead of stripping the hyphens one at a time.
d <- as.numeric(format(d, "%Y%m%d"))
time <- data.frame(d, d1)
rm(list = ls())
# For every item (V2) of the raw feature table: order its rows by date (V1),
# derive the weekday and the unit price V29 / V30, and plot V30 over time.
w <- read.table("F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/CAINIAO data p1_20160408/item_feature1.csv", header = FALSE, sep = ",")
e <- unique(w$V2)

# Replace each NaN with the nearest earlier non-NaN value; a leading NaN
# becomes 0. Single-pass equivalent of the original repeat-until-done loop.
fill_nan_forward <- function(v) {
  if (length(v) == 0L) {
    return(v)
  }
  if (is.nan(v[1])) {
    v[1] <- 0
  }
  # For every position, the index of the last non-NaN element at or before it.
  source_idx <- cummax(ifelse(is.nan(v), 0L, seq_along(v)))
  v[source_idx]
}

for (i in seq_along(e)) {
  w1 <- w[which(w$V2 == e[i]), ]
  w1 <- w1[order(w1[, 1], decreasing = FALSE), ] # ascending by date
  tn <- w1$V2 # item id
  t1 <- w1$V1 # date as yyyymmdd
  day <- as.Date(as.character(t1), format = "%Y%m%d")
  # ISO weekday number (Monday = 1, ..., Sunday = 7); unlike translating the
  # names returned by weekdays(), this does not depend on a Chinese locale.
  t2 <- as.numeric(format(day, "%u"))
  # Unit price; 0 / 0 gives NaN on days without sales, filled from the
  # previous day. is.nan() replaces the string comparison == 'NaN'.
  t7 <- fill_nan_forward(w1$V29 / w1$V30)
  t8 <- w1$V30 # quantity sold
  re <- data.frame(tn, t1, t2, t7, t8)
  plot(re$t1, re$t8, "l")
  # write.table (re,append = TRUE, file ="F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/rere02.csv",sep =",",row.names = F,col.names=F,quote =F)
}
# Extract the feature rows of a single item (the 200th distinct V2 in `e`) from
# `w`, plot its V30 over time and append the rows to rere01.csv.
w1 <- w[which(w$V2 == e[200]), ]
w1 <- w1[order(w1[, 1], decreasing = FALSE), ] # ascending by date
tn <- w1$V2 # item id
t1 <- w1$V1 # date as yyyymmdd
day <- as.Date(as.character(t1), format = "%Y%m%d")
# ISO weekday number (Monday = 1, ..., Sunday = 7); unlike translating the
# names returned by weekdays(), this does not depend on a Chinese locale.
t2 <- as.numeric(format(day, "%u"))
t3 <- w1$V3 # leaf category id
t4 <- w1$V4 # top-level category id
t5 <- w1$V5 # brand id
t6 <- w1$V6 # supplier id
# Unit price; 0 / 0 gives NaN on days without sales. A leading NaN becomes 0
# and every other NaN takes the nearest earlier non-NaN value. is.nan()
# replaces the string comparison == 'NaN', and the single indexed copy
# replaces the repeat-until-done loop.
t7 <- w1$V29 / w1$V30
if (length(t7) > 0 && is.nan(t7[1])) {
  t7[1] <- 0
}
t7 <- t7[cummax(ifelse(is.nan(t7), 0L, seq_along(t7)))]
t8 <- w1$V30 # quantity sold
re <- data.frame(tn, t1, t2, t3, t4, t5, t6, t7, t8)
plot(re$t1, re$t8, "l")
write.table(re, append = TRUE, file = "F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/rere01.csv", sep = ",", row.names = FALSE, col.names = FALSE, quote = FALSE)
线性拟合
rm(list = ls())
# Fit a linear model of the scaled last column (V8) on row order (V1), weekday
# (V2) and unit price (V7) for the first item in rere01.csv, holding out the
# last five rows, then print the predictions next to the held-out values.
w <- read.table("F:/笔记学习/天池比赛/菜鸟需求预测与分仓规划/rere01.csv", header = FALSE, sep = ",")
e <- unique(w$V1)
w1 <- w[which(w$V1 == e[1]), ]
d1 <- seq_along(w1$V1)
da <- data.frame(d1, w1$V3, w1$V4, w1$V5, w1$V6, w1$V7, w1$V8, w1$V9)
# Scale every column without centring so that no column dominates through its
# units.
dad <- scale(da, center = FALSE, scale = TRUE)
colnames(dad) <- paste0("V", 1:8)
dad <- as.data.frame(dad)
library("forecast")
n_rows <- length(dad$V1)
dad1 <- dad[1:(n_rows - 5), ] # training rows
dad2 <- dad[(n_rows - 4):n_rows, ] # last five rows, held out
r <- lm(V8 ~ V1 + V2 + V7, data = dad1)
z <- dad2[, c("V1", "V2", "V7")]
row.names(z) <- c(1:length(dad2$V1))
predict(r, z)
dad2$V8
前面没有,有个点特别多:101615 102955 145284 147185
只要是双11双12量特别大,其他时间基本很少123571 11108