step1. Study the page structure: each scenic spot on the list page links to its own detail page, e.g. https://piao.ctrip.com/ticket/dest/t2286.html
step2. After following a spot's link, the comments do not show up in scrapy shell, which suggests they are loaded by an AJAX-style request.
- The endpoint turns out to be: https://sec-m.ctrip.com/restapi/soa2/12530/json/viewCommentList
- The request body carries the spot's viewid (the 2286 in the link above) plus paging parameters and the like, which you can set yourself; the endpoint can be probed directly, as sketched below.
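Before wiring this into Scrapy, the endpoint can be sanity-checked with a plain POST; a minimal sketch using requests, reusing the request body that getBody() builds further down (viewid 2286 is the spot from step 1, pagesize is an arbitrary choice):

```python
import json
import requests

# probe the AJAX comment endpoint found in step 2; the body mirrors
# the one built in getBody() below, with viewid set to the spot code
url = "https://sec-m.ctrip.com/restapi/soa2/12530/json/viewCommentList"
body = {
    "pageid": "10650000804",
    "viewid": "2286",       # the spot code from the detail-page url
    "tagid": "0",
    "pagenum": "1",
    "pagesize": "10",
    "contentType": "json",
    "head": {"appid": "100013776", "cid": "09031037211035410190",
             "ctok": "", "cver": "1.0", "lang": "01", "sid": "8888",
             "syscode": "09", "auth": "",
             "extension": [{"name": "protocal", "value": "https"}]},
    "ver": "7.10.3.0319180000",
}
resp = requests.post(url, data=json.dumps(body),
                     headers={"Content-Type": "application/json"})
print(resp.json()["data"]["comments"][0]["content"])
```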
step3. The crawler is split into two stages:
- crawl each scenic spot's code
- use those codes to crawl the spot's comments
step4. The source code is on GitHub: https://github.com/wenwen0220/xiechengDemo
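Both spiders import their item classes from xiechengDemo/items.py, which this post doesn't show. Judging from the fields assigned in the spiders below, it presumably looks something like this (the real definitions are in the repo):

```python
import scrapy

class SceneryCodeItem(scrapy.Item):
    provinceName = scrapy.Field()
    cityName = scrapy.Field()
    sceneryName = scrapy.Field()
    sceneryCode = scrapy.Field()

class SceneryCommentsItem(scrapy.Item):
    id = scrapy.Field()
    uid = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
    date = scrapy.Field()
    score = scrapy.Field()
    sceneryCode = scrapy.Field()
    sceneryName = scrapy.Field()
```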
The main code is as follows.
Crawling the codes:
```python
import scrapy
from xiechengDemo.items import SceneryCodeItem
import re

# crawl each scenic spot's code from the sight-list pages
class SceneryCodeSpider(scrapy.Spider):
    name = "sceneryCode"
    # the urls to crawl
    # start_urls = ['https://you.ctrip.com/sightlist/shandong100039/s0-p2.html']
    # or read them straight from a file
    start_urls = [i.strip() for i in open('/Users/jw/python/xiechengDemo/urls.txt').readlines()]

    def parse(self, response):
        # pull the spot names out with xpath
        sceneryName_list = response.xpath('.//*[@class="list_mod2"]/div[2]/dl/dt/a/text()').extract()
        # and each spot's link url
        sceneryUrl_list = response.xpath('.//*[@class="list_mod2"]/div[2]/dl/dt/a/@href').extract()
        for name, url in zip(sceneryName_list, sceneryUrl_list):
            # split the url to get the spot code and the city name
            uri = url.split("/")
            sceneryItem = SceneryCodeItem()
            sceneryItem['provinceName'] = "shandong"
            # the non-digit part is the city name, e.g. "qingdao"
            sceneryItem['cityName'] = re.findall(r"\D+", uri[2])[0]
            sceneryItem['sceneryName'] = name
            # the digit part is the spot code, e.g. "1234"
            sceneryItem['sceneryCode'] = re.findall(r"\d+", uri[3])[0]
            yield sceneryItem
```
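urls.txt presumably holds the paginated sight-list URLs, one per line, in the same shape as the commented-out example above. The next spider reads its codes from sceneryCode.json; one way to produce that file from this spider is Scrapy's built-in feed export, e.g. `scrapy crawl sceneryCode -o sceneryCode.json`.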
Crawling the comments:
```python
import scrapy
from xiechengDemo.items import SceneryCommentsItem
import json
import datetime
from datetime import date

# crawl each scenic spot's comments by its code
class SceneryCommentSpider(scrapy.Spider):
    name = "sceneryComment"

    def start_requests(self):
        postUrl = "https://sec-m.ctrip.com/restapi/soa2/12530/json/viewCommentList"
        for data in self.getBody():
            # FormRequest's content-type defaults to
            # "application/x-www-form-urlencoded", which makes this API return
            # an empty response, so build the request by hand instead.
            # yield scrapy.FormRequest(url=postUrl, formdata=data, callback=self.parse)
            yield scrapy.Request(
                postUrl,
                body=json.dumps(data[0]),
                method='POST',
                headers={'Content-Type': 'application/json'},
                callback=lambda response, sceneryCode=data[1], sceneryName=data[2]:
                    self.parse(response, sceneryCode, sceneryName))

    def parse(self, response, sceneryCode, sceneryName):
        # only keep comments from 2019-01-01 onwards
        beginDate = date(2019, 1, 1)
        jsonArray = json.loads(response.body)['data']['comments']
        for i in jsonArray:
            # parse the comment timestamp and keep only its date part
            commentDate = datetime.datetime.strptime(i['date'], '%Y-%m-%d %H:%M').date()
            # skip comments older than 2019
            if commentDate < beginDate:
                continue
            sceneryCommentsItem = SceneryCommentsItem()
            sceneryCommentsItem['id'] = i['id']
            sceneryCommentsItem['uid'] = i['uid']
            sceneryCommentsItem['title'] = i['title']
            sceneryCommentsItem['content'] = i['content']
            sceneryCommentsItem['date'] = i['date']
            sceneryCommentsItem['score'] = i['score']
            sceneryCommentsItem['sceneryCode'] = sceneryCode
            sceneryCommentsItem['sceneryName'] = sceneryName
            yield sceneryCommentsItem

    # build one request body per scenic spot
    def getBody(self):
        listData = []
        # read the codes the first spider produced
        with open('/Users/jw/python/xiechengDemo/sceneryCode.json', 'r') as f:
            jsonArray = json.load(f)
        for i in jsonArray:
            # adjust pagenum/pagesize to however many pages you want to crawl
            data = {
                "pageid": "10650000804",
                "viewid": i['sceneryCode'],
                "tagid": "0",
                "pagenum": "1",
                "pagesize": "50",
                "contentType": "json",
                "head": {
                    "appid": "100013776",
                    "cid": "09031037211035410190",
                    "ctok": "",
                    "cver": "1.0",
                    "lang": "01",
                    "sid": "8888",
                    "syscode": "09",
                    "auth": "",
                    "extension": [
                        {
                            "name": "protocal",
                            "value": "https"
                        }
                    ]
                },
                "ver": "7.10.3.0319180000"
            }
            listData.append([data, i['sceneryCode'], i['sceneryName']])
        return listData
```
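A note on the lambda callback above: it works because the default arguments freeze the per-request values, but since Scrapy 1.7 the same thing can be done more directly with cb_kwargs. As a drop-in replacement for the Request in start_requests (same names as above):

```python
# equivalent request passing the extra values via cb_kwargs (Scrapy >= 1.7)
yield scrapy.Request(
    postUrl,
    body=json.dumps(data[0]),
    method='POST',
    headers={'Content-Type': 'application/json'},
    callback=self.parse,
    cb_kwargs={'sceneryCode': data[1], 'sceneryName': data[2]},
)
```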
Finally, the results were written to MySQL:
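The pipeline doing the write isn't shown here (check the repo); a minimal sketch of one way to do it with pymysql, where the connection settings and the scenery_comment table name are assumptions and the columns mirror SceneryCommentsItem:

```python
import pymysql

class MysqlPipeline(object):
    # a sketch only: host/user/password/db and the table name are assumptions
    def open_spider(self, spider):
        self.conn = pymysql.connect(host='localhost', user='root',
                                    password='root', db='xiecheng',
                                    charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # columns mirror the SceneryCommentsItem fields
        sql = ("INSERT INTO scenery_comment "
               "(id, uid, title, content, date, score, sceneryCode, sceneryName) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
        self.cursor.execute(sql, (item['id'], item['uid'], item['title'],
                                  item['content'], item['date'], item['score'],
                                  item['sceneryCode'], item['sceneryName']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
```

To enable it, register the class under ITEM_PIPELINES in settings.py.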