# 本代码可以直接执行
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 23 01:10:10 2020
爬取携程景点评论的代码
"""
import re
import requests
import json
import time
import pandas as pd
# HTTP headers sent with every request to the comment API.
head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0'}
# Endpoint that every comment-list request is POSTed to.
postUrl = "https://sec-m.ctrip.com/restapi/soa2/12530/json/viewCommentList"
# To crawl comments for a different attraction, edit this list only.
# Each entry is [viewid, attraction name]; only the viewid is sent to the API.
urls = [
    ['1486015', '广州图书馆'],
]

# Pause before each page request (presumably to avoid hammering the server).
REQUEST_DELAY_SECONDS = 3
# Upper bound on how long a single HTTP request may block.
REQUEST_TIMEOUT_SECONDS = 30
# Destination file and its column headers.
OUTPUT_CSV = "广图携程评论.csv"
CSV_COLUMNS = ['用户ID', '用户名', '用户评分', '评价内容', '时间']


def build_payload(view_id, page_number, tag_id):
    """Return the request body for one page of an attraction's comments.

    Args:
        view_id: Attraction identifier (first element of a ``urls`` entry).
        page_number: 1-based page index; sent to the API as a string.
        tag_id: Value for the ``tagid`` field. The script uses "-11" for the
            initial page-count request and "0" for the page requests.
            NOTE(review): the meaning of these values is not visible here.

    Returns:
        A JSON-serialisable dict.
    """
    return {
        "pageid": "10650000804",
        "viewid": view_id,
        "tagid": tag_id,
        "pagenum": str(page_number),
        "pagesize": "10",
        "contentType": "json",
        "SortType": "1",
        "head": {
            "appid": "100013776",
            "cid": "09031164110643039198",
            "ctok": "",
            "cver": "1.0",
            "lang": "01",
            "sid": "8888",
            "syscode": "09",
            "auth": "",
            "extension": [
                {
                    "name": "protocal",
                    "value": "https"
                }
            ]
        },
        "ver": "7.10.3.0319180000"
    }


def post_json(payload):
    """POST ``payload`` as JSON text to ``postUrl`` and return the decoded reply.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    response = requests.post(
        postUrl,
        data=json.dumps(payload),
        headers=head,  # the User-Agent was defined but never sent before
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    return response.json()


def comment_to_row(comment):
    """Pick the exported fields out of one comment dict, in CSV column order."""
    return (
        comment['id'],
        comment['uid'],
        comment['score'],
        comment['content'],
        comment['date'],
    )


def fetch_total_pages(view_id):
    """Ask the API how many comment pages exist for ``view_id``."""
    first_page = post_json(build_payload(view_id, 1, "-11"))
    return first_page['data']['totalpage']


def fetch_comment_rows(view_id, total_pages):
    """Download every comment page of ``view_id`` and return the rows.

    Args:
        view_id: Attraction identifier.
        total_pages: Number of pages to request (pages 1..total_pages).

    Returns:
        A list of tuples in ``CSV_COLUMNS`` order.
    """
    rows = []
    for page_number in range(1, total_pages + 1):
        print('正在抓取第' + str(page_number) + "页")
        time.sleep(REQUEST_DELAY_SECONDS)
        page = post_json(build_payload(view_id, page_number, "0"))
        # Tolerate a page whose comment list is null/missing.
        comments = page['data'].get('comments') or []
        rows.extend(comment_to_row(comment) for comment in comments)
    return rows


def main():
    """Crawl every attraction in ``urls`` and write all comments to one CSV."""
    rows = []
    for entry in urls:
        view_id = entry[0]
        rows.extend(fetch_comment_rows(view_id, fetch_total_pages(view_id)))
    pf = pd.DataFrame(rows, columns=CSV_COLUMNS)
    # utf-8-sig writes a BOM at the start of the file.
    pf.to_csv(OUTPUT_CSV, encoding="utf-8-sig", header=True, index=False)


if __name__ == "__main__":
    main()
# 参考文献
# [1]: https://github.com/eshinesimida/ctrip/blob/master/xiecheng.py
# [2]: Python 携程用户评论信息爬取视频