scrapy爬取某网站景区评论爬虫

step1.研究网页结构,每个景点有一个景区的超链接 https://piao.ctrip.com/ticket/dest/t2286.html

step2.链接到景区后,评论内容在 scrapy shell 中不显示,推测评论是由 ajax 等方式发起的请求加载的。

  1. 找到的地址是:https://sec-m.ctrip.com/restapi/soa2/12530/json/viewCommentList
  2. 请求体中包含,景区的viewid,就是景区链接里2286,其他就是一些分页等的内容,可以自己设定。

step3.计划这个爬虫分2步

  1. 爬取景点的code
  2. 根据code爬取 景区的评论

step4.源码放到git上了:https://github.com/wenwen0220/xiechengDemo

主要代码如下:

爬取code:

import scrapy
from xiechengDemo.items import SceneryCodeItem
import random
import re
#爬取景区的code
class SceneryCodeSpider(scrapy.Spider):
	"""Crawl Ctrip sight-list pages and extract each scenic spot's name and code."""
	name = "sceneryCode"
	# URLs to crawl, one per line in the text file.
	# start_urls = ['https://you.ctrip.com/sightlist/shandong100039/s0-p2.html']
	# NOTE(review): file handle is never closed (class-body read); acceptable for a
	# one-shot spider but worth confirming.
	start_urls = [i.strip() for i in open('/Users/jw/python/xiechengDemo/urls.txt').readlines()]

	def parse(self, response):
		"""Extract (name, url) pairs from the list page and yield one item per spot.

		:param response: the sight-list page response
		:yields: SceneryCodeItem with provinceName/cityName/sceneryName/sceneryCode
		"""
		# Scenic-spot display names.
		sceneryName_list = response.xpath('.//*[@class="list_mod2"]/div[2]/dl/dt/a/text()').extract()
		# Scenic-spot link hrefs, e.g. "/sight/qingdao120/1234.html".
		sceneryUrl_list = response.xpath('.//*[@class="list_mod2"]/div[2]/dl/dt/a/@href').extract()

		for spot_name, spot_url in zip(sceneryName_list, sceneryUrl_list):
			# Split the URL: segment 2 carries the city slug, segment 3 the code.
			uri = spot_url.split("/")
			sceneryItem = SceneryCodeItem()
			sceneryItem['provinceName'] = "shandong"
			# Leading non-digit run of the city segment, e.g. "qingdao" from "qingdao120".
			sceneryItem['cityName'] = re.findall(r"\D+", uri[2])[0]
			sceneryItem['sceneryName'] = spot_name
			# Digit run of the last segment is the scenery code, e.g. "1234".
			sceneryItem['sceneryCode'] = re.findall(r"\d+", uri[3])[0]
			yield sceneryItem

爬取评论 

import scrapy
from xiechengDemo.items import SceneryCommentsItem
import random
import json
import re
import datetime
from datetime import date

#根据景区的id爬取景区的评论
class SceneryCommentSpider(scrapy.Spider):
	"""Fetch scenic-spot comments from Ctrip's JSON comment API, one spot per request."""
	name = "sceneryComment"

	def start_requests(self):
		"""Yield one JSON POST request per scenic spot read from sceneryCode.json."""
		postUrl = "https://sec-m.ctrip.com/restapi/soa2/12530/json/viewCommentList"
		for body, code, spot_name in self.getBody():
			# FormRequest defaults Content-Type to application/x-www-form-urlencoded,
			# which makes the API return an empty body — send raw JSON instead.
			# yield scrapy.FormRequest(url=postUrl, formdata=data, callback=self.parse)
			yield scrapy.Request(
				postUrl,
				body=json.dumps(body),
				method='POST',
				headers={'Content-Type': 'application/json'},
				# Bind code/name as lambda defaults so each callback keeps its own
				# values instead of closing over the loop variables (late binding).
				callback=lambda response, sceneryCode=code, sceneryName=spot_name: self.parse(response, sceneryCode, sceneryName))

	def parse(self, response, sceneryCode, sceneryName):
		"""Parse the JSON comment list and yield items for comments dated 2019 or later.

		:param response: API response whose body is JSON with data.comments
		:param sceneryCode: code of the scenic spot this response belongs to
		:param sceneryName: display name of the scenic spot
		:yields: SceneryCommentsItem per qualifying comment
		"""
		# Only keep comments from this date onwards.
		beginDate = date(2019, 1, 1)

		jsonArray = json.loads(response.body)['data']['comments']
		for comment in jsonArray:
			# API dates look like "2019-01-02 13:45"; compare by calendar day only.
			commentDate = datetime.datetime.strptime(comment['date'], '%Y-%m-%d %H:%M').date()
			# Skip comments earlier than 2019.
			if commentDate < beginDate:
				continue

			sceneryCommentsItem = SceneryCommentsItem()
			sceneryCommentsItem['id'] = comment['id']
			sceneryCommentsItem['uid'] = comment['uid']
			sceneryCommentsItem['title'] = comment['title']
			sceneryCommentsItem['content'] = comment['content']
			sceneryCommentsItem['date'] = comment['date']
			sceneryCommentsItem['score'] = comment['score']
			sceneryCommentsItem['sceneryCode'] = sceneryCode
			sceneryCommentsItem['sceneryName'] = sceneryName
			yield sceneryCommentsItem

	def getBody(self):
		"""Build (request_body, sceneryCode, sceneryName) triples from the code file.

		:returns: list of 3-tuples, one per scenic spot in sceneryCode.json
		"""
		listData = []
		with open('/Users/jw/python/xiechengDemo/sceneryCode.json', 'r') as f:
			jsonArray = json.load(f)
		for entry in jsonArray:
			# Page number/size are adjustable; only viewid varies per spot.
			data = {
				"pageid": "10650000804",
				"viewid": entry['sceneryCode'],
				"tagid": "0",
				"pagenum": "1",
				"pagesize": "50",
				"contentType": "json",
				"head": {
					"appid": "100013776",
					"cid": "09031037211035410190",
					"ctok": "",
					"cver": "1.0",
					"lang": "01",
					"sid": "8888",
					"syscode": "09",
					"auth": "",
					"extension": [
						{
							"name": "protocal",
							"value": "https"
						}
					]
				},
				"ver": "7.10.3.0319180000"
			}
			listData.append((data, entry['sceneryCode'], entry['sceneryName']))
		return listData

最后的结果,写到了mysql如下:

 

  • 2
    点赞
  • 12
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值