Crawling Ctrip's domestic destination guide comments with Scrapy

Here we use the Scrapy framework to crawl the data.
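If you are starting from scratch, a Scrapy project can be scaffolded with the standard commands below (the project name ctrip_comments is just an example; the spider's name attribute in the code is 'Ctrip'):

    scrapy startproject ctrip_comments
    cd ctrip_comments
    scrapy genspider Ctrip you.ctrip.com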

The code in the spider file (note: the regex patterns in the original post were mangled when it was rendered to HTML, so the patterns shown below are hedged reconstructions, marked with NOTE comments):

# -*- coding: utf-8 -*-
import os
import re

import scrapy
from copy import deepcopy


class ItcastSpider(scrapy.Spider):
    # Spider name; needed when running the spider, must be unique
    name = 'Ctrip'
    # Initial URL list; crawling starts from these URLs
    start_urls = ['http://you.ctrip.com/place/']

    def parse(self, response):
        # Collect the URL and name of every domestic region
        url_list = response.xpath('//*[@id="journals-panel-items"]/dl[2]/dd/ul/li/a/@href').extract()
        name_list = response.xpath('//*[@id="journals-panel-items"]/dl[2]/dd/ul/li/a/text()').extract()
        for name, url in zip(name_list, url_list):
            # One folder per domestic region ('路径' is the author's placeholder for a local base path)
            os.makedirs('路径' + name, exist_ok=True)
            yield scrapy.Request('http://you.ctrip.com' + url,
                                 callback=self.parse_data,
                                 meta={'text_name': deepcopy(name)})

    def parse_data(self, response):
        os_name = response.meta['text_name']
        # Link to the region's sight list
        url = response.xpath('/html/body/div[3]/div/div/ul/li[3]/a/@href').extract_first()
        yield scrapy.Request('http://you.ctrip.com' + url,
                             callback=self.parses,
                             meta={'text_name': deepcopy(os_name)})

    def parses(self, response):
        os_name = response.meta['text_name']
        # Page counter carried through meta; absent on the first page
        id = response.meta.get('id')
        if id is not None:
            id += 1
        # NOTE: the original regex was lost when the post was rendered to HTML;
        # this pattern for sight links is a plausible reconstruction
        url_list = re.findall(r'<a target="_blank" href="(/sight/.*?\.html)"', response.text)
        # Prefix every relative sight URL with http://you.ctrip.com and follow it
        su = 0
        for url in url_list:
            if not url.startswith('http'):  # relative URL: needs the prefix
                su += 1
                yield scrapy.Request('http://you.ctrip.com' + url,
                                     callback=self.parse_data_list,
                                     meta={'text_name': deepcopy(os_name)})
        if not id:
            id = 2
        if su != 0:
            # NOTE: the original hard-codes Guilin's sight list (guilin28) for pagination
            url_s = 'http://you.ctrip.com/sightlist/guilin28/s0-p{}.html'.format(id)
            yield scrapy.Request(url_s, callback=self.parses,
                                 meta={'id': id, 'text_name': deepcopy(os_name)})

    def parse_data_list(self, response):
        os_name = response.meta['text_name']
        # NOTE: the original pattern around (.*?) was stripped by the blog's HTML;
        # an <h1> title pattern for the sight name is assumed here
        name = re.findall(r'<h1>(.*?)</h1>', response.text)[0]
        id = re.findall('var poiid = "(.*?)"', response.text)
        id = str(id[0])
        data = {              # POST parameters
            'poiID': id,      # poiID of the sight's comment page
            'pagenow': '1',   # comment page number
        }
        # Endpoint that returns Ctrip comments; not the same as the sight URL
        url = 'http://you.ctrip.com/destinationsite/TTDSecond/SharedView/AsynCommentView'
        yield scrapy.FormRequest(url=url,
                                 formdata=data,
                                 callback=self.parse_page,
                                 meta={'name': name, 'data': data, 'text_name': os_name})

    def parse_page(self, response):
        os_name = response.meta['text_name']
        name = response.meta['name']
        data = response.meta['data']
        data['pagenow'] = str(int(data['pagenow']) + 1)
        url = 'http://you.ctrip.com/destinationsite/TTDSecond/SharedView/AsynCommentView'
        # NOTE: the original pattern was also stripped; the comment text is assumed
        # to sit in a "heightbox" span. Returns the list of comments on this page.
        list_comment = re.findall(r'<span class="heightbox">(.*?)</span>', response.text)
        if list_comment:
            # Append the comments to the file for this region and sight
            with open('路径' + os_name + '/' + name + '.txt', 'a') as f:
                for i in list_comment:
                    # strip leftover whitespace and HTML entities from the comment text
                    f.write(i.replace(' ', '').replace('&quot;', '').replace('&nbsp;', ''))
                    f.write('\n')
                f.write('\n')
            # Keep re-posting with an incremented pagenow until a page comes back empty
            yield scrapy.FormRequest(url=url,
                                     formdata=data,
                                     callback=self.parse_page,
                                     meta={'name': name, 'data': data, 'text_name': os_name})
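With the spider saved in the project's spiders/ directory, run it by the name attribute defined above:

    scrapy crawl Ctrip

The comment scraping itself rests on one detail worth seeing in isolation: the AsynCommentView endpoint takes a POST with poiID and pagenow and returns one page of comment HTML, so parse_page just keeps incrementing pagenow until a page yields no comments. A minimal standalone sketch of that loop with the requests library (the poiID value is hypothetical, and the 'heightbox' pattern is the same assumption as in the spider):

    import re
    import requests

    URL = 'http://you.ctrip.com/destinationsite/TTDSecond/SharedView/AsynCommentView'

    def fetch_comments(poi_id):
        page = 1
        while True:
            # Each POST returns one page of rendered comment HTML
            resp = requests.post(URL, data={'poiID': poi_id, 'pagenow': str(page)})
            comments = re.findall(r'<span class="heightbox">(.*?)</span>', resp.text)
            if not comments:   # an empty page means we've run out of comments
                break
            for c in comments:
                print(c)
            page += 1

    fetch_comments('75633')  # hypothetical poiID for illustration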


Source: https://www.cnblogs.com/heimaguangzhou/p/11590613.html
