# 模糊匹配:把名称相似的条目配对起来。例如,从携程爬取的景区名称与同城的名称并不完全一致,但其中几个关键词是一致的,因此可以通过分词、投票等方法把它们匹配上。
library(rJava)
library(Rwordseg)
# Input stage: switch to the project folder, read both CSV tables, and
# prepare the table that is going to be matched.
setwd("D:\\Documents\\work\\匹配")

# sub_data holds the names to be matched; total_data is the reference
# product table they are matched against.
sub_data <- read.csv(file = "match.csv", stringsAsFactors = FALSE)
total_data <- read.csv(file = "门票景区.csv", stringsAsFactors = FALSE)

# Word-segment every POI name with Rwordseg. The result is stored as a
# per-row column (presumably one token vector per name; the matching loop
# unlists each entry -- TODO confirm for single-row input).
sub_data[["split"]] <- segmentCN(sub_data$POI名称)

# Empty-string placeholders, filled in later by the matching loop.
sub_data[["lvmm_id"]] <- ""
sub_data[["lvmm_pro"]] <- ""
# Fuzzy-match each row of sub_data to a row of total_data by keyword voting:
# every token of the segmented POI name that occurs in a product name
# (total_data$产品名称) gives that product one vote, and the first product
# with the most votes wins.
n_sub <- nrow(sub_data)
for (j in seq_len(n_sub)) {
  # Tokens of the j-th POI name. NA and empty tokens are dropped because
  # they cannot identify a product.
  words <- unlist(sub_data$split[j])
  words <- words[!is.na(words) & nzchar(words)]

  # grepl() is vectorised over the product names, so one call per token
  # replaces the row-by-row scan. fixed = TRUE treats the token as literal
  # text, so regex metacharacters in a name cannot raise an error or match
  # more loosely than intended.
  score <- numeric(nrow(total_data))
  for (word in words) {
    score <- score + grepl(word, total_data$产品名称, fixed = TRUE)
  }
  total_data$score <- score

  # Copy the winner's first two columns (assumed to be the product ID and
  # product name -- TODO confirm against 门票景区.csv). which.max() returns
  # the first maximum, so ties still resolve to the earliest row. When no
  # token matched at all, the "" placeholders are kept instead of recording
  # an arbitrary first-row "match".
  if (length(score) > 0 && max(score) > 0) {
    best_row <- which.max(score)
    sub_data[j, c("lvmm_id", "lvmm_pro")] <- total_data[best_row, c(1, 2)]
  }

  # Progress indicator: fraction of rows processed so far.
  print(j / n_sub)
}
# Attach the destination province of each matched product: look the matched
# ID up in total_data$产品ID and take that row's 产品目的省份 value
# (NA when the ID is not found).
sub_data$province <- total_data[
  match(sub_data$lvmm_id, total_data$产品ID),
  "产品目的省份"
]

# Export columns 1, 3, 4 and 5 by position.
# NOTE(review): this presumably keeps the POI name, lvmm_id, lvmm_pro and
# province while skipping the token column -- that only holds if match.csv
# has a single column; confirm against the input file.
write.csv(
  x = sub_data[, c(1, 3:5)],
  file = "result.csv",
  row.names = FALSE
)