step1. Study the page structure: each scenic spot on the list page links to its own detail page, e.g. https://piao.ctrip.com/ticket/dest/t2286.html
step2. After following a spot's link, the comments do not show up in scrapy shell, which suggests they are loaded by an AJAX-style request.
- The endpoint turns out to be: https://sec-m.ctrip.com/restapi/soa2/12530/json/viewCommentList
- The request body carries the spot's viewid (the 2286 in the link above) plus paging parameters and the like, which you can set yourself; the endpoint can be probed directly, as sketched below.
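Before wiring this into Scrapy, the endpoint can be sanity-checked with a plain POST; a minimal sketch using requests, reusing the request body that getBody() builds further down (viewid 2286 is the spot from step 1, pagesize is an arbitrary choice):

```python
import json
import requests

# probe the AJAX comment endpoint found in step 2; the body mirrors
# the one built in getBody() below, with viewid set to the spot code
url = "https://sec-m.ctrip.com/restapi/soa2/12530/json/viewCommentList"
body = {
    "pageid": "10650000804",
    "viewid": "2286",       # the spot code from the detail-page url
    "tagid": "0",
    "pagenum": "1",
    "pagesize": "10",
    "contentType": "json",
    "head": {"appid": "100013776", "cid": "09031037211035410190",
             "ctok": "", "cver": "1.0", "lang": "01", "sid": "8888",
             "syscode": "09", "auth": "",
             "extension": [{"name": "protocal", "value": "https"}]},
    "ver": "7.10.3.0319180000",
}
resp = requests.post(url, data=json.dumps(body),
                     headers={"Content-Type": "application/json"})
print(resp.json()["data"]["comments"][0]["content"])
```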
step3. The crawler is split into two stages:
- crawl each scenic spot's code
- use those codes to crawl the spot's comments
step4. The source code is on GitHub: https://github.com/wenwen0220/xiechengDemo
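Both spiders import their item classes from xiechengDemo/items.py, which this post doesn't show. Judging from the fields assigned in the spiders below, it presumably looks something like this (the real definitions are in the repo):

```python
import scrapy

class SceneryCodeItem(scrapy.Item):
    provinceName = scrapy.Field()
    cityName = scrapy.Field()
    sceneryName = scrapy.Field()
    sceneryCode = scrapy.Field()

class SceneryCommentsItem(scrapy.Item):
    id = scrapy.Field()
    uid = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
    date = scrapy.Field()
    score = scrapy.Field()
    sceneryCode = scrapy.Field()
    sceneryName = scrapy.Field()
```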
The main code is as follows.
Crawling the codes:
```python
import scrapy
from xiechengDemo.items import SceneryCodeItem
import re

# crawl each scenic spot's code from the sight-list pages
class SceneryCodeSpider(scrapy.Spider):
    name = "sceneryCode"
    # the urls to crawl
    # start_urls = ['https://you.ctrip.com/sightlist/shandong100039/s0-p2.html']
    # or read them straight from a file
    start_urls = [i.strip() for i in open('/Users/jw/python/xiechengDemo/urls.txt').readlines()]

    def parse(self, response):
        # pull the spot names out with xpath
        sceneryName_list = response.xpath('.//*[@class="list_mod2"]/div[2]/dl/dt/a/text()').extract()
        # and each spot's link url
        sceneryUrl_list = response.xpath('.//*[@class="list_mod2"]/div[2]/dl/dt/a/@href').extract()
        for name, url in zip(sceneryName_list, sceneryUrl_list):
            # split the url to get the spot code and the city name
            uri = url.split("/")
            sceneryItem = SceneryCodeItem()
            sceneryItem['provinceName'] = "shandong"
            # the non-digit part is the city name, e.g. "qingdao"
            sceneryItem['cityName'] = re.findall(r"\D+", uri[2])[0]
            sceneryItem['sceneryName'] = name
            # the digit part is the spot code, e.g. "1234"
            sceneryItem['sceneryCode'] = re.findall(r"\d+", uri[3])[0]
            yield sceneryItem
```
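urls.txt presumably holds the paginated sight-list URLs, one per line, in the same shape as the commented-out example above. The next spider reads its codes from sceneryCode.json; one way to produce that file from this spider is Scrapy's built-in feed export, e.g. `scrapy crawl sceneryCode -o sceneryCode.json`.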
Crawling the comments:
```python
import scrapy
from xiechengDemo.items import SceneryCommentsItem
import json
import datetime
from datetime import date

# crawl each scenic spot's comments by its code
class SceneryCommentSpider(scrapy.Spider):
    name = "sceneryComment"

    def start_requests(self):
        postUrl = "https://sec-m.ctrip.com/restapi/soa2/12530/json/viewCommentList"
        for data in self.getBody():
            # FormRequest's content-type defaults to
            # "application/x-www-form-urlencoded", which makes this API return
            # an empty response, so build the request by hand instead.
            # yield scrapy.FormRequest(url=postUrl, formdata=data, callback=self.parse)
            yield scrapy.Request(
                postUrl,
                body=json.dumps(data[0]),
                method='POST',
                headers={'Content-Type': 'application/json'},
                callback=lambda response, sceneryCode=data[1], sceneryName=data[2]:
                    self.parse(response, sceneryCode, sceneryName))

    def parse(self, response, sceneryCode, sceneryName):
        # only keep comments from 2019-01-01 onwards
        beginDate = date(2019, 1, 1)
        jsonArray = json.loads(response.body)['data']['comments']
        for i in jsonArray:
            # parse the comment timestamp and keep only its date part
            commentDate = datetime.datetime.strptime(i['date'], '%Y-%m-%d %H:%M').date()
            # skip comments older than 2019
            if commentDate < beginDate:
                continue
            sceneryCommentsItem = SceneryCommentsItem()
            sceneryCommentsItem['id'] = i['id']
            sceneryCommentsItem['uid'] = i['uid']
            sceneryCommentsItem['title'] = i['title']
            sceneryCommentsItem['content'] = i['content']
            sceneryCommentsItem['date'] = i['date']
            sceneryCommentsItem['score'] = i['score']
            sceneryCommentsItem['sceneryCode'] = sceneryCode
            sceneryCommentsItem['sceneryName'] = sceneryName
            yield sceneryCommentsItem

    # build one request body per scenic spot
    def getBody(self):
        listData = []
        # read the codes the first spider produced
        with open('/Users/jw/python/xiechengDemo/sceneryCode.json', 'r') as f:
            jsonArray = json.load(f)
        for i in jsonArray:
            # adjust pagenum/pagesize to however many pages you want to crawl
            data = {
                "pageid": "10650000804",
                "viewid": i['sceneryCode'],
                "tagid": "0",
                "pagenum": "1",
                "pagesize": "50",
                "contentType": "json",
                "head": {
                    "appid": "100013776",
                    "cid": "09031037211035410190",
                    "ctok": "",
                    "cver": "1.0",
                    "lang": "01",
                    "sid": "8888",
                    "syscode": "09",
                    "auth": "",
                    "extension": [
                        {
                            "name": "protocal",
                            "value": "https"
                        }
                    ]
                },
                "ver": "7.10.3.0319180000"
            }
            listData.append([data, i['sceneryCode'], i['sceneryName']])
        return listData
```
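A note on the lambda callback above: it works because the default arguments freeze the per-request values, but since Scrapy 1.7 the same thing can be done more directly with cb_kwargs. As a drop-in replacement for the Request in start_requests (same names as above):

```python
# equivalent request passing the extra values via cb_kwargs (Scrapy >= 1.7)
yield scrapy.Request(
    postUrl,
    body=json.dumps(data[0]),
    method='POST',
    headers={'Content-Type': 'application/json'},
    callback=self.parse,
    cb_kwargs={'sceneryCode': data[1], 'sceneryName': data[2]},
)
```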
Finally, the results were written to MySQL:
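The pipeline doing the write isn't shown here (check the repo); a minimal sketch of one way to do it with pymysql, where the connection settings and the scenery_comment table name are assumptions and the columns mirror SceneryCommentsItem:

```python
import pymysql

class MysqlPipeline(object):
    # a sketch only: host/user/password/db and the table name are assumptions
    def open_spider(self, spider):
        self.conn = pymysql.connect(host='localhost', user='root',
                                    password='root', db='xiecheng',
                                    charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # columns mirror the SceneryCommentsItem fields
        sql = ("INSERT INTO scenery_comment "
               "(id, uid, title, content, date, score, sceneryCode, sceneryName) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
        self.cursor.execute(sql, (item['id'], item['uid'], item['title'],
                                  item['content'], item['date'], item['score'],
                                  item['sceneryCode'], item['sceneryName']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
```

To enable it, register the class under ITEM_PIPELINES in settings.py.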