第一部分 爬虫爬取阿坝州主要景点携程评论数据
import re
import requests
import json
import time
import pandas as pd
# Crawl Ctrip review data (score, date, text) for the attractions listed in
# `urls` and store everything in one CSV file.

# Browser-like request headers sent with every API call.
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0',
}
postUrl = "https://sec-m.ctrip.com/restapi/soa2/12530/json/viewCommentList"

# [Ctrip view id, attraction name] pairs.  Every entry needs a trailing comma:
# without it two adjacent list literals are parsed as a subscript expression
# and the script dies with a TypeError before any request is made.
urls = [
    ['1409209', '中国古羌城'],
    ['63315', '四姑娘山'],
    ['77380', '九寨沟风景区'],
    ['78031', '毕棚沟'],
    ['2673', '黄龙风景名胜区'],
    ['63306', '达古冰山地质公园'],
    ['136686', '若尔盖花湖'],
    ['1922979', '卧龙中华大熊猫苑神树坪基地'],
    ['63317', '红原—若尔盖大草原'],
    ['77698', '叠溪-松坪沟风景区'],
    ['63305', '米亚罗风景区'],
    ['63376', '黄河九曲第一湾'],
    ['136693', '双桥沟'],
    ['63351', '桃坪羌寨'],
    ['55651', '松潘古城'],
    ['1407342', '太子岭滑雪场'],
    ['127927', '映秀地震遗址'],
    ['1996233', '鹧鸪山自然公园滑雪场'],
    ['63371', '卓克基土司官寨文化旅游景区'],
]

OUTPUT_CSV = "ctr_scores.csv"
OUTPUT_COLUMNS = ['comments', 'time', 'score', 'jingqu']
REQUEST_DELAY_SECONDS = 3     # pause before every page request
REQUEST_TIMEOUT_SECONDS = 30  # never hang forever on a stalled connection


def build_payload(view_id, page_num, tag_id="0"):
    """Return the JSON body for one page of the comment-list API.

    Args:
        view_id: Ctrip id of the attraction (first element of a `urls` entry).
        page_num: 1-based page number; sent to the API as a string.
        tag_id: Value of the "tagid" field.  The probing request that reads
            the page count uses "-11", the page requests use "0".
    """
    return {
        "pageid": "10650000804",
        "viewid": view_id,
        "tagid": tag_id,
        "pagenum": str(page_num),
        "pagesize": "10",
        "contentType": "json",
        "SortType": "1",
        "head": {
            "appid": "100013776",
            "cid": "09031164110643039198",
            "ctok": "",
            "cver": "1.0",
            "lang": "01",
            "sid": "8888",
            "syscode": "09",
            "auth": "",
            "extension": [
                {
                    "name": "protocal",
                    "value": "https",
                },
            ],
        },
        "ver": "7.10.3.0319180000",
    }


def fetch_page(payload):
    """POST `payload` to the comment API and return the response's "data" object.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    response = requests.post(
        postUrl,
        data=json.dumps(payload),
        headers=head,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    return json.loads(response.text)['data']


def extract_rows(comments, jingqu):
    """Turn one page of API comment objects into CSV rows for attraction `jingqu`.

    Args:
        comments: List of comment dicts from the API, or None for an empty page.
        jingqu: Attraction name stored with every row.

    Returns:
        A list of dicts keyed by the CSV column names.
    """
    return [
        {
            # NOTE(review): assumes the review text lives under 'content' --
            # confirm against a live API response.
            'comments': item.get('content', ''),
            'time': item['date'],
            'score': item['score'],
            'jingqu': jingqu,
        }
        for item in (comments or [])
    ]


def main():
    """Crawl every attraction in `urls` and write all reviews to OUTPUT_CSV."""
    rows = []
    for view_id, jingqu in urls:
        # Probe request: only used to learn how many pages this attraction has.
        pages = fetch_page(build_payload(view_id, 1, tag_id="-11"))['totalpage']
        for page_num in range(1, pages + 1):
            payload = build_payload(view_id, page_num)
            print('正在抓取第' + payload['pagenum'] + "页")
            time.sleep(REQUEST_DELAY_SECONDS)
            rows.extend(extract_rows(fetch_page(payload)['comments'], jingqu))

    # Rows of all attractions are accumulated and written once, so a later
    # attraction can no longer overwrite the data of an earlier one.
    pf = pd.DataFrame(rows, columns=OUTPUT_COLUMNS)
    pf.to_csv(OUTPUT_CSV, encoding="utf-8", header=True, index=False)


if __name__ == "__main__":
    main()
爬虫代码参考CSDN的一位博主,我今天懒得找出处了,哪天找到补上
第二部分 携程评分可视化
一、准备
library(lubridate)
library(ggplot2)
library(reshape2)
library(knitr)
# Batch-read every per-attraction CSV found in `path`.  Each file is expected
# to provide at least a `time` and a `score` column.
path<-"C:\\Users\\91333\\Documents\\semester6\\VS code\\VScode Python\\scores"
fileNames<-dir(path)
filePath<-sapply(fileNames, function(x){
  file.path(path,x)})
scores_data<-lapply(filePath, function(x){
  read.csv(x, header=TRUE)})
# Keep only the calendar day (first 10 characters) of the timestamp.
scores_data<-lapply(scores_data,function(x){
  data.frame(time=as.POSIXlt(substr(x$time,1,10)),score=x$score)})
# Derive the calendar fields used by the later aggregations.
scores_data<-lapply(scores_data,function(x){
  data.frame(year=year(x$time),month=month(x$time),wday=wday(x$time),
             yearmonth=format(x$time,"%Y-%m"),time=x$time,score=x$score)})

# Number of reviews per month for each attraction, keyed by the first day of
# that month.
freqall<-lapply(scores_data, function(x){
  counts <- table(x$yearmonth)
  data.frame(yearmonth=ymd(paste0(names(counts),'-01')),freq=as.numeric(counts))
})
for (i in seq_along(freqall)){
  colnames(freqall[[i]]) <- c("yearmonth",fileNames[i])
}

# Full outer join of all per-attraction tables on `yearmonth`.  Reduce() also
# works with one or two files, where `3:length(freqall)` would count backwards.
merged <- Reduce(function(a,b) merge(a,b,all=TRUE), freqall)

# Column 1 is the month; the remaining columns are named after the source
# files, so drop the file extension to obtain the attraction name.
colnames(merged)[-1] <- tools::file_path_sans_ext(colnames(merged)[-1])
colnames(merged)[1]<-'时间'
# A month without any review for an attraction counts as zero reviews.
merged[is.na(merged)]<-0
# NOTE(review): hard-coded row window; presumably the months 2013-01..2020-04
# named in the plot titles -- confirm against the actual data.
merged <- merged[32:116,]
二、绘制分地区条形图
1.准备
# Per-attraction summary table: mean / variance / median of the scores plus
# the total number of reviews, with an extra "average" row appended.
# The attraction columns of `merged` are everything after the month column, so
# the ranges are derived instead of hard-coding 18 attractions.
site_cols <- 2:ncol(merged)
barplotdata <- data.frame('景点'=colnames(merged)[site_cols],
                          '评论数'=as.numeric(colSums(merged[,site_cols,drop=FALSE])))
# One row per attraction, in the same order as `scores_data`.
score_stats <- t(vapply(scores_data, function(x){
  c(mean(x$score),var(x$score),median(x$score))
}, numeric(3)))
barplotdata <- data.frame(score_stats,barplotdata)
colnames(barplotdata)[1:3]<-c('均值','方差','中位数')
barplotdata<-barplotdata[order(barplotdata[,'均值'],decreasing = TRUE),]
# Append the across-attraction average of the numeric columns (1,2,3,5) as an
# additional last row.
avg_row <- nrow(barplotdata)+1
barplotdata[avg_row,c(1,2,3,5)] <- colMeans(barplotdata[,c(1,2,3,5)])
# Placeholder attraction name so the row-name assignment below sees no NA.
barplotdata[avg_row,4]<-99999
rownames(barplotdata)<- t(barplotdata['景点'])
rownames(barplotdata)[avg_row]<-'平均'
kable(barplotdata[,c(1,2,3,5)])
景点 | 均值 | 方差 | 中位数 | 评论数 |
---|---|---|---|---|
九寨沟 | 4.565448 | 1.4860773 | 5.000000 | 3010.0000 |
黄龙 | 4.555615 | 0.7628229 | 5.000000 | 1834.0000 |
鹧鸪山滑雪场 | 4.506250 | 1.2108168 | 5.000000 | 480.0000 |
映秀地震遗址 | 4.463636 | 0.7703611 | 5.000000 | 220.0000 |
卧龙 | 4.404348 | 0.9385431 | 5.000000 | 460.0000 |
太子岭滑雪场 | 4.379310 | 1.4888438 | 5.000000 | 290.0000 |
达古冰川 | 4.352273 | 1.5276399 | 5.000000 | 879.0000 |
四姑娘山 | 4.350526 | 1.7537142 | 5.000000 | 942.0000 |
红原-若尔盖草原 | 4.322222 | 1.6162632 | 5.000000 | 176.0000 |
叠溪-松坪沟 | 4.306000 | 1.2268176 | 5.000000 | 500.0000 |
中国古羌城 | 4.275000 | 0.9427673 | 4.000000 | 160.0000 |
毕棚沟 | 4.190909 | 1.9315108 | 5.000000 | 2530.0000 |
松潘古城 | 4.171429 | 1.0422420 | 4.000000 | 210.0000 |
若尔盖花湖 | 4.133962 | 2.0633056 | 5.000000 | 529.0000 |
九曲黄河第一湾 | 4.067442 | 2.2355342 | 5.000000 | 428.0000 |
桃坪羌寨 | 4.034043 | 1.9050220 | 5.000000 | 469.0000 |
卓克基土司官寨 | 3.925000 | 2.4597484 | 5.000000 | 160.0000 |
双桥沟 | 3.916667 | 2.9302650 | 5.000000 | 240.0000 |
平均 | 4.273338 | 1.5717942 | 4.888889 | 750.9444 |
# Bar chart of the mean score per attraction.  Bars show (mean - 3.9) so the
# differences are visible; the y axis is relabelled with the real scores.
# The `+` must end a line: a line that starts with `+` is parsed as a new
# expression and the layers are never added to the plot.
# head(..., -1) drops the appended "average" row.
ggplot(head(barplotdata,-1)) +
  geom_bar(aes(x=reorder(景点,-均值),y=均值-3.9,fill=reorder(景点,-均值)),stat ="identity") +
  xlab("景点") +
  ggtitle('阿坝州主要景点2013年1月至2020年4月携程网评分均值得分图') +
  guides(fill=guide_legend(title=NULL))+scale_fill_hue(h = c(0,180), c=60,l=70) +
  # Labels are break + 3.9, so 0.6 corresponds to 4.5 (not 4.7).
  scale_y_continuous(breaks=c(0,0.2,0.4,0.6),labels = c("3.9","4.1","4.3","4.5")) +
  ylab('评分均值') +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5), legend.key.size = unit(15, "pt"), axis.text.x = element_blank())
# Tag every review with its attraction name (column i+1 of `merged` belongs to
# the i-th data set) and stack all attractions into one long data frame.
for(i in seq_along(scores_data)){
  scores_data[[i]]['景区'] <- colnames(merged)[i+1]
}
# A single rbind over the whole list also works for one or two attractions,
# where `3:length(scores_data)` would count backwards.  unname() keeps the
# automatic 1..n row names instead of prefixing them with the file names.
newdata <- do.call(rbind,unname(scores_data))
#for(i in 1:length(scores_data)){
# for(j in 1:length(scores_data)){
# if(i ==j){ scores_data[[i]][colnames(merged)[i+1]] <- 1
# }
# else{scores_data[[i]][colnames(merged)[j+1]] <- 0
# }
# }
#}
#newdata<-rbind(scores_data[[1]],scores_data[[2]])
#for(i in 3:length(scores_data)){
# newdata <- rbind(newdata,scores_data[[i]])
#}
# Box plot with a violin overlay of the score distribution per attraction,
# ordered by descending mean score.
# The `+` must end a line: a line that starts with `+` is parsed as a new
# expression and the layers are never added to the plot.
ggplot(newdata,aes(x=reorder(景区,-score,FUN=mean),y=score)) +
  geom_boxplot(fill="lightblue",alpha=0.4,color = '#46A3FF') +
  geom_violin(color='red',fill="pink",alpha=0.4) +
  theme_classic() +
  ylab('携程评分') +
  xlab('景点') +
  ggtitle('阿坝州主要景点2013年1月至2020年4月携程网评分小提琴图') +
  theme(plot.title = element_text(hjust = 0.5))
# Monthly mean score per attraction (one column each, joined on the month)
# plus the across-attraction average for every month.
time_score<-NULL
for(i in seq_along(scores_data)){
  monthly <-tapply(scores_data[[i]]$score,scores_data[[i]]$yearmonth,mean)
  monthly <-data.frame(ymd(paste0(names(monthly),'-01')),as.numeric(monthly))
  colnames(monthly)<-c('时间',colnames(merged)[i+1])
  # merge(x, NULL) returns a zero-row frame, which silently threw away the
  # first attraction's data; seed the accumulator with the first table.
  time_score <- if (is.null(time_score)) monthly else merge(monthly,time_score,all = TRUE)
}
# Average only over attractions that actually have reviews in a month.
# Filling the gaps with 0 would treat "no review" as a score of 0 and drag the
# average down.
site_cols <- setdiff(colnames(time_score),'时间')
time_score['主要景点平均评分'] <- rowMeans(time_score[,site_cols,drop=FALSE],na.rm = TRUE)
# Line chart of the monthly average score across all attractions.
# The `+` must end a line: a line that starts with `+` is parsed as a new
# expression and the layers are never added to the plot.
# NOTE(review): hard-coded row window; presumably 2013-01..2020-04 as stated
# in the title -- confirm against the data.
ggplot(time_score[32:116,], aes(x=时间,y=主要景点平均评分)) +
  geom_line() +
  theme_classic() +
  ggtitle('阿坝州主要景点2013年1月至2020年4月携程网评分均值折线图') +
  theme(plot.title = element_text(hjust = 0.5))
#ggplot(scores_data[[1]])+geom_line(aes(x=yearmonth,y=score))
# Yearly mean score per attraction, first wide (one column per attraction)
# and then melted to long format for plotting.
year_score<-NULL
for(i in seq_along(scores_data)){
  yearly <-tapply(scores_data[[i]]$score,scores_data[[i]]$year,mean)
  yearly <-data.frame(as.numeric(names(yearly)),as.numeric(yearly))
  colnames(yearly)<-c('年份',colnames(merged)[i+1])
  # merge(x, NULL) returns a zero-row frame, which silently threw away the
  # first attraction's data; seed the accumulator with the first table.
  year_score <- if (is.null(year_score)) yearly else merge(yearly,year_score,all = TRUE)
}
# NOTE(review): hard-coded row window; presumably the years 2013..2020 named
# in the plot titles -- confirm against the data.
year_score<-year_score[11:18,]
#year_score[is.na(year_score)]<-0
year_score2 <- melt(year_score, id.vars="年份")
colnames(year_score2)[2]<-'景点'
# Yearly mean-score line charts for three groups of attractions.
# `year_score2` holds 8 consecutive rows (one per year) per attraction, so
# every row range below selects whole attractions by position.
plot_yearly_scores <- function(rows, title){
  ggplot(year_score2[rows,], aes(x=年份, y=value)) +
    geom_line(aes(color=景点),size=1) +
    # The lines are mapped to `color`, so the legend title has to be removed
    # on the colour guide; a `fill` guide has no effect here.
    guides(color=guide_legend(title=NULL)) +
    # theme_classic() is a complete theme and replaces earlier theme()
    # settings, so it has to come before the title centring.
    theme_classic() +
    theme(plot.title = element_text(hjust = 0.5)) +
    ylab('评分') +
    ggtitle(title)
}
plot_yearly_scores(c(1:16,25:32,41:48,57:64),'阿坝州人文类景点2013年至2020年携程网评分年度均值折线图')
plot_yearly_scores(c(17:24,49:56),'阿坝州热门滑雪场2013年至2020年携程网评分年度均值折线图')
# NOTE(review): the original range 65:114 is not a multiple of 8 and left the
# last attractions out of every chart; presumably a typo for 65:144, i.e. all
# remaining rows -- confirm.
plot_yearly_scores(c(65:nrow(year_score2),33:40),'阿坝州自然类景点2013年至2020年携程网评分年度均值折线图')
# Map every review's month (1-12) to its season through a lookup vector:
# Dec-Feb winter, Mar-May spring, Jun-Aug summer, Sep-Nov autumn.
season_by_month <- c("冬季","冬季",
                     "春季","春季","春季",
                     "夏季","夏季","夏季",
                     "秋季","秋季","秋季",
                     "冬季")
newdata$season <- season_by_month[newdata$month]

# Box plot with a violin overlay of the scores per season, ordered by
# descending mean score.
season_plot <- ggplot(newdata,aes(x=reorder(season,-score,FUN=mean),y=score)) +
  geom_boxplot(fill="lightblue",alpha=0.4,color = '#46A3FF') +
  geom_violin(color='red',fill="pink",alpha=0.4) +
  theme_classic() +
  ylab('携程评分') +
  xlab('季节') +
  ggtitle('阿坝州主要景点2013至2020年携程网季节评分小提琴图') +
  theme(plot.title = element_text(hjust = 0.5))
print(season_plot)
# Total number of reviews per month over all attractions, drawn as a line.
reviews_per_month <- table(newdata$yearmonth)
month_start <- ymd(paste0(names(reviews_per_month),'-01'))
yearmonth_times <- data.frame(as.numeric(reviews_per_month),month_start)
colnames(yearmonth_times) <- c('评论数','时间')

count_plot <- ggplot(yearmonth_times[32:116,],aes(x=时间,y=评论数)) +
  geom_line() +
  theme_classic() +
  ggtitle('阿坝州主要景点2013至2020年携程网评论数折线图') +
  theme(plot.title = element_text(hjust = 0.5))
print(count_plot)