R 携程评论可视化

第一部分  爬虫爬取阿坝州主要景点携程评论数据

import re
import requests
import json
import time
import pandas as pd

# HTTP headers that present the client as a desktop Firefox browser.
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0'}
# Endpoint that receives the comment-list POST requests.
postUrl = "https://sec-m.ctrip.com/restapi/soa2/12530/json/viewCommentList"


# [viewid, display name] pairs, one per scenic spot to scrape.
# Every entry needs a trailing comma: without it two adjacent list literals
# are parsed as a subscript expression and raise TypeError at import time.
urls = [
    ['1409209', '中国古羌城'],
    ['63315', '四姑娘山'],
    ['77380', '九寨沟风景区'],
    ['78031', '毕棚沟'],
    ['2673', '黄龙风景名胜区'],
    ['63306', '达古冰山地质公园'],
    ['136686', '若尔盖花湖'],
    ['1922979', '卧龙中华大熊猫苑神树坪基地'],
    ['63317', '红原—若尔盖大草原'],
    ['77698', '叠溪-松坪沟风景区'],
    ['63305', '米亚罗风景区'],
    ['63376', '黄河九曲第一湾'],
    ['136693', '双桥沟'],
    ['63351', '桃坪羌寨'],
    ['55651', '松潘古城'],
    ['1407342', '太子岭滑雪场'],
    ['127927', '映秀地震遗址'],
    ['1996233', '鹧鸪山自然公园滑雪场'],
    ['63371', '卓克基土司官寨文化旅游景区'],
]

def _build_payload(view_id, page_num, tag_id="0"):
    """Return the request body asking for one page of comments of one spot."""
    return {
        "pageid": "10650000804",
        "viewid": view_id,
        "tagid": tag_id,
        "pagenum": str(page_num),
        "pagesize": "10",
        "contentType": "json",
        "SortType": "1",
        "head": {
            "appid": "100013776",
            "cid": "09031164110643039198",
            "ctok": "",
            "cver": "1.0",
            "lang": "01",
            "sid": "8888",
            "syscode": "09",
            "auth": "",
            "extension": [
                {
                    "name": "protocal",
                    "value": "https"
                }
            ]
        },
        "ver": "7.10.3.0319180000"
    }


def _fetch_data(payload):
    """POST *payload* to the comment endpoint and return the reply's 'data' dict.

    An empty dict is returned when the reply carries no 'data' object, so the
    caller can treat a malformed page as "no comments" instead of crashing.
    """
    # Send the User-Agent header defined at the top of the script (it was
    # declared but never used) and bound the wait so a stalled connection
    # cannot hang the whole run.
    response = requests.post(postUrl, data=json.dumps(payload),
                             headers=head, timeout=30)
    return json.loads(response.text).get('data') or {}


for spot_index, (view_id, jingqu) in enumerate(urls):
    # The first request is only used to learn how many pages exist; it keeps
    # the "-11" tag id of the original probe request.
    probe = _fetch_data(_build_payload(view_id, 1, tag_id="-11"))
    pages = int(probe.get('totalpage') or 0)

    scores = []
    contents = []
    times1 = []
    jingqus = []
    for page_num in range(1, pages + 1):
        print('正在抓取第' + str(page_num) + "页")
        time.sleep(3)  # throttle between page requests
        page = _fetch_data(_build_payload(view_id, page_num))

        for comment in page.get('comments') or []:
            scores.append(comment['score'])
            times1.append(comment['date'])
            # Assumes the comment text lives under the 'content' key -- TODO
            # confirm against a live response.  The default keeps all four
            # columns the same length, which pd.DataFrame requires.
            contents.append(comment.get('content', ''))
            jingqus.append(jingqu)

    pf = pd.DataFrame({'comments': contents, 'time': times1,
                       'score': scores, 'jingqu': jingqus})
    # Write the header once, then append: rewriting the file for every spot
    # left only the last spot's rows on disk.  Appending per spot also keeps
    # the rows already collected if a later request fails.
    is_first_spot = spot_index == 0
    pf.to_csv("ctr_scores.csv", encoding="utf-8",
              mode="w" if is_first_spot else "a",
              header=is_first_spot, index=False)

 爬虫代码参考了CSDN上一位博主的文章,原文出处暂时没有找到,找到后会补上链接。

第二部分 携程评分可视化

一、准备 


# Packages used by the analysis: date helpers, plotting, reshaping, tables.
library(lubridate)
library(ggplot2)
library(reshape2)
library(knitr)

# Batch-read every file found in the scores directory (one file per attraction).
path<-"C:\\Users\\91333\\Documents\\semester6\\VS code\\VScode Python\\scores"
fileNames<-dir(path)
# sapply keeps the file names as element names, which later label the data.
filePath<-sapply(fileNames, function(x){
  paste(path,x,sep='/')})
scores_data<-lapply(filePath, function(x){
  read.csv(x, header=TRUE)})
# Keep only the date part (first 10 characters of `time`) and the score.
scores_data<-lapply(scores_data,function(x){
  data.frame(time=as.POSIXlt(substr(x$time,1,10)),score=x$score)})
# Add derived calendar columns: year, month, weekday and "YYYY-MM".
scores_data<-lapply(scores_data,function(x){
  data.frame(year=year(x$time),month=month(x$time),wday=wday(x$time),yearmonth=substr(x$time,1,7),time=x$time,score=x$score)})
# Number of comments per month for each attraction, keyed by the first day of
# the month. sep='' builds a clean "YYYY-MM-01" string for ymd().
freqall<-lapply(scores_data, function(x){
  monthly <- table(x$yearmonth)
  data.frame(yearmonth=ymd(paste(names(monthly),'-01',sep='')),freq = as.numeric(monthly))
})
for (i in seq_along(freqall)){
  colnames(freqall[[i]]) <- c("yearmonth",fileNames[i])
}
# Full outer join of all monthly tables. Reduce() also works when there are
# fewer than three files, where a `3:length(freqall)` loop would index out of
# range.
merged <- Reduce(function(acc, nxt) merge(acc, nxt, all=TRUE), freqall)
# Drop the last four characters (the ".csv" suffix) from every column name;
# the first column is renamed right after, so its truncation does not matter.
colnames(merged) <- substr(colnames(merged), 1, nchar(colnames(merged)) - 4)
colnames(merged)[1]<-'时间'
# Months without comments for an attraction count as zero.
merged[is.na(merged)]<-0
# NOTE(review): fixed row window, presumably the months covered by the report
# (the plot titles say it starts in January 2013) -- confirm against the data.
merged <- merged[32:116,]

 二、绘制分地区条形图

1.准备

# Total number of comments per attraction: column sums of the monthly count
# table (columns 2-19 of `merged`, i.e. 18 attractions are hard-coded here).
barplotdata <- data.frame('景点'=colnames(merged)[2:19],'评论数'=as.numeric(colSums(merged[,2:19])))
# Prepend mean, variance and median of the raw scores of each attraction.
# Assumes scores_data is in the same order as the columns of merged -- TODO confirm.
barplotdata <- data.frame(t(as.data.frame(lapply(scores_data, function(x){
  c(mean(x$score),var(x$score),median(x$score))
}))),barplotdata)
colnames(barplotdata)[1:3]<-c('均值','方差','中位数')
# Sort attractions by mean score, highest first.
barplotdata<-barplotdata[order(barplotdata[,'均值'],decreasing = T),]
# Row 19 is a summary row holding the column means of the four numeric columns.
barplotdata[19,c(1,2,3,5)] <- colMeans(barplotdata[,c(1,2,3,5)])
# Placeholder for the attraction-name column of the summary row; presumably
# only there so the row-name assignment below has no missing value -- confirm.
barplotdata[19,4]<-99999
# Use the attraction names as row names and label the summary row.
rownames(barplotdata)<- t(barplotdata['景点'])
rownames(barplotdata)[19]<-'平均'
# Print the numeric columns only (the name column is now the row names).
kable(barplotdata[,c(1,2,3,5)])

 

 

|  | 均值 | 方差 | 中位数 | 评论数 |
|---|---|---|---|---|
| 九寨沟 | 4.565448 | 1.4860773 | 5.000000 | 3010.0000 |
| 黄龙 | 4.555615 | 0.7628229 | 5.000000 | 1834.0000 |
| 鹧鸪山滑雪场 | 4.506250 | 1.2108168 | 5.000000 | 480.0000 |
| 映秀地震遗址 | 4.463636 | 0.7703611 | 5.000000 | 220.0000 |
| 卧龙 | 4.404348 | 0.9385431 | 5.000000 | 460.0000 |
| 太子岭滑雪场 | 4.379310 | 1.4888438 | 5.000000 | 290.0000 |
| 达古冰川 | 4.352273 | 1.5276399 | 5.000000 | 879.0000 |
| 四姑娘山 | 4.350526 | 1.7537142 | 5.000000 | 942.0000 |
| 红原-若尔盖草原 | 4.322222 | 1.6162632 | 5.000000 | 176.0000 |
| 叠溪-松坪沟 | 4.306000 | 1.2268176 | 5.000000 | 500.0000 |
| 中国古羌城 | 4.275000 | 0.9427673 | 4.000000 | 160.0000 |
| 毕棚沟 | 4.190909 | 1.9315108 | 5.000000 | 2530.0000 |
| 松潘古城 | 4.171429 | 1.0422420 | 4.000000 | 210.0000 |
| 若尔盖花湖 | 4.133962 | 2.0633056 | 5.000000 | 529.0000 |
| 九曲黄河第一湾 | 4.067442 | 2.2355342 | 5.000000 | 428.0000 |
| 桃坪羌寨 | 4.034043 | 1.9050220 | 5.000000 | 469.0000 |
| 卓克基土司官寨 | 3.925000 | 2.4597484 | 5.000000 | 160.0000 |
| 双桥沟 | 3.916667 | 2.9302650 | 5.000000 | 240.0000 |
| 平均 | 4.273338 | 1.5717942 | 4.888889 | 750.9444 |

# Bar chart of the mean score per attraction (rows 1-18; row 19 is the summary
# row). Bars show `mean - 3.9` so the small differences are visible; the y-axis
# labels translate the shifted values back to real scores.
# Each `+` must end the line: a line that starts with `+` is parsed as a new
# expression, so the layers would never be added to the plot.
ggplot(barplotdata[1:18,]) +
  geom_bar(aes(x=reorder(景点,-均值),y=均值-3.9,fill=reorder(景点,-均值)),stat ="identity") +
  xlab("景点") +
  ggtitle('阿坝州主要景点2013年1月至2020年4月携程网评分均值得分图') +
  guides(fill=guide_legend(title=NULL)) +
  scale_fill_hue(h = c(0,180), c=60,l=70) +
  # Break 0.6 corresponds to 3.9 + 0.6 = 4.5 (it was mislabelled "4.7").
  scale_y_continuous(breaks=c(0,0.2,0.4,0.6),labels = c("3.9","4.1","4.3","4.5")) +
  ylab('评分均值') +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5), legend.key.size = unit(15, "pt"), axis.text.x  = element_blank())

 

# Tag every review with the name of its attraction (column i+1 of `merged`
# belongs to the i-th data set).
for(i in seq_along(scores_data)){
  scores_data[[i]]['景区'] <- colnames(merged)[i+1]
}
# Stack all attractions into one long table in a single call. This also works
# with fewer than three data sets, where a `3:length(scores_data)` loop would
# index out of range. unname() keeps the row names as plain running integers.
newdata <- do.call(rbind, unname(scores_data))

#for(i in 1:length(scores_data)){
#  for(j in 1:length(scores_data)){
#    if(i ==j){ scores_data[[i]][colnames(merged)[i+1]] <- 1
#    }
#    else{scores_data[[i]][colnames(merged)[j+1]] <- 0
#    }
#  }
#}
#newdata<-rbind(scores_data[[1]],scores_data[[2]])
#for(i in 3:length(scores_data)){
#  newdata <- rbind(newdata,scores_data[[i]])
#}

# Box plot overlaid with a violin plot of the scores of each attraction,
# ordered by descending mean score.
# Each `+` must end the line: a line that starts with `+` is parsed as a new
# expression, so the layers would never be added to the plot.
ggplot(newdata[,],aes(x=reorder(景区,-score,FUN=mean),y=score)) +
  geom_boxplot(fill="lightblue",alpha=0.4,color = '#46A3FF') +
  geom_violin(color='red',fill="pink",alpha=0.4) +
  theme_classic() +
  ylab('携程评分') +
  xlab('景点') +
  ggtitle('阿坝州主要景点2013年1月至2020年4月携程网评分小提琴图') +
  theme(plot.title = element_text(hjust = 0.5))

# Monthly mean score of every attraction, joined into one wide table with one
# row per month, plus the across-attraction average used by the line chart.
time_score<-NULL
for(i in seq_along(scores_data)){
  monthly_mean <- tapply(scores_data[[i]]$score,scores_data[[i]]$yearmonth,mean)
  # sep='' builds a clean "YYYY-MM-01" string for ymd().
  useless <- data.frame(ymd(paste(names(monthly_mean),'-01',sep='')),as.numeric(monthly_mean))
  colnames(useless)<-c('时间',colnames(merged)[i+1])
  # Seed the table with the first attraction. merge() against NULL has no
  # common column, falls back to a Cartesian product with an empty frame and
  # returns zero rows, which silently discarded the first attraction's values.
  time_score <- if (is.null(time_score)) useless else merge(useless,time_score,all = TRUE)
}
# NOTE(review): months in which an attraction has no review are scored 0 and
# therefore pull the average down; rowMeans(..., na.rm = TRUE) on the NA values
# may be what is wanted -- kept as is so the published chart is not redefined.
time_score[is.na(time_score)]<-0
# Average over every attraction column (all columns except the date).
time_score['主要景点平均评分']=rowMeans(time_score[,-1,drop=FALSE])

# Line chart of the across-attraction monthly mean score for rows 32-116 of
# time_score (the same row window that is applied to `merged`).
# Each `+` must end the line: a line that starts with `+` is parsed as a new
# expression, so the layers would never be added to the plot.
ggplot(time_score[32:116,], aes(x=时间,y=主要景点平均评分)) +
  geom_line() +
  theme_classic() +
  ggtitle('阿坝州主要景点2013年1月至2020年4月携程网评分均值折线图') +
  theme(plot.title = element_text(hjust = 0.5))
#ggplot(scores_data[[1]])+geom_line(aes(x=yearmonth,y=score))

# Yearly mean score of every attraction, joined into one wide table and then
# melted to long form (one row per year and attraction) for the line charts.
year_score<-NULL
for(i in 1:length(scores_data)){
# Mean score per calendar year for the i-th attraction.
useless <-tapply(scores_data[[i]]$score,scores_data[[i]]$year,mean)
useless <-data.frame(year(paste(names(useless),'-01-01',sep='')),as.numeric(useless))
colnames(useless)<-c('年份',colnames(merged)[i+1])
# Each new attraction is placed before the columns already collected.
# NOTE(review): on the first pass year_score is NULL; merge() with an empty
# frame appears to return zero rows, so the first attraction's values may be
# lost here -- confirm before relying on that column.
year_score <- merge(useless,year_score,all = T)
}
# NOTE(review): fixed row window, presumably the years 2013-2020 -- confirm.
year_score<-year_score[11:18,]
#year_score[is.na(year_score)]<-0
# Long format: columns are 年份, the attraction name and `value` (mean score).
year_score2 <- melt(year_score, id.vars="年份")
colnames(year_score2)[2]<-'景点'

# Yearly mean-score line charts for three groups of attractions. The groups are
# picked by fixed row blocks of year_score2 (8 rows per attraction).
# Two fixes apply to all three charts:
#  * theme_classic() is a complete theme and resets earlier theme() settings,
#    so the title centring now comes after it;
#  * the lines are mapped to `color`, so the legend title has to be removed
#    through guides(color = ...); guides(fill = ...) had no effect.

# Cultural attractions.
ggplot(year_score2[c(1:16,25:32,41:48,57:64),], aes(x=年份, y=value)) +
  geom_line(aes(color=景点),size=1) +
  guides(color=guide_legend(title=NULL)) +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) +
  ylab('评分') +
  ggtitle('阿坝州人文类景点2013年至2020年携程网评分年度均值折线图')

# Ski resorts.
ggplot(year_score2[c(17:24,49:56),], aes(x=年份, y=value)) +
  geom_line(aes(color=景点),size=1) +
  guides(color=guide_legend(title=NULL)) +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) +
  ylab('评分') +
  ggtitle('阿坝州热门滑雪场2013年至2020年携程网评分年度均值折线图')

# Natural attractions.
# NOTE(review): 65:114 ends two rows into an 8-row block and leaves later
# blocks out; 65:144 may have been intended -- confirm before changing.
ggplot(year_score2[c(65:114,33:40),], aes(x=年份, y=value)) +
  geom_line(aes(color=景点),size=1) +
  guides(color=guide_legend(title=NULL)) +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) +
  ylab('评分') +
  ggtitle('阿坝州自然类景点2013年至2020年携程网评分年度均值折线图')

 

 

 

 

 

# Label every review with its season: Mar-May spring, Jun-Aug summer,
# Sep-Nov autumn, Dec-Feb winter. The vector is indexed by month number 1-12.
newdata$season <- c("冬季","冬季",
                    "春季","春季","春季",
                    "夏季","夏季","夏季",
                    "秋季","秋季","秋季",
                    "冬季")[newdata$month]
 

# Box plot overlaid with a violin plot of the scores per season, with seasons
# ordered by descending mean score.
ggplot(newdata[,],aes(x=reorder(season,-score,FUN=mean),y=score)) +
  geom_boxplot(fill="lightblue",alpha=0.4,color = '#46A3FF') +
  geom_violin(color='red',fill="pink",alpha=0.4) +
  theme_classic() +
  ylab('携程评分') +
  xlab('季节') +
  ggtitle('阿坝州主要景点2013至2020年携程网季节评分小提琴图') +
  theme(plot.title = element_text(hjust = 0.5))

# Number of comments per month over all attractions, keyed by the first day of
# the month.
yearmonth_times <- local({
  per_month <- table(newdata$yearmonth)
  counts <- data.frame(as.numeric(per_month), ymd(paste0(names(per_month), '-01')))
  colnames(counts) <- c('评论数','时间')
  counts
})

# Line chart of the monthly number of comments for rows 32-116 of
# yearmonth_times.
ggplot(yearmonth_times[32:116,],aes(x=时间,y=评论数)) +
  geom_line() +
  theme_classic() +
  ggtitle('阿坝州主要景点2013至2020年携程网评论数折线图') +
  theme(plot.title = element_text(hjust = 0.5))

 

 

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值