class BusSpider(BaseSpider):
#设置爬虫名称
name = "xinlang"
#设置起始URL列表
start_urls = ["http://travel.sina.com.cn/list-wenda/all/5"]
def parse(self, response):
req = []
hxs = HtmlXPathSelector(response)
how=hxs.x('/html/body/div[5]/div[1]/div[2]/div[1]/a[5]/text()').extract()
#print 'how many--',how[0].strip()
if int(how[0].strip())!= int(5):
#print 'not ==',how[0].strip()
nn=hxs.x('/html/body/div[5]/div[1]/div[2]/div[1]/a[9]/@href').extract()
#print '------',len(nn)
n=Request(url="http://travel.sina.com.cn" + nn[0].strip(),callback=self.parse)
yield n
cat_urls = hxs.x('/html/body/div[5]/div[1]/div[1]/ul/li/a/@href').extract()
print 'cat_urls =', cat_urls
for url in cat_urls:
#构建新的URL
new_url = "http://travel.sina.com.cn" + url
print "[parse]new_url = %s" % (new_url)
#创建对应的页面的Request对象,设定回调函数为parse_cat,利用parse_cat处理返回的页面
r = Request(new_url, callback=self.parse_cat)
yield r
return
def parse_cat(self, response):
hxs = HtmlXPathSelector(response)
title = hxs.x('//h1[@id="artibodyTitle"]/text()').extract()
des= hxs.x('/html/body/div[5]/div[1]/div[1]/p/text()').extract()
ans=hxs.x('/html/body/div[5]/div[1]/div[2]/ul/li/p/text()').extract()
#结果写入到记录的文件之中
print "#####"
global SUM
SUM+=1
fp = codecs.open('record'+str(SUM)+'.txt', 'w', 'utf-8')
if filter_tags(title[0].strip()):
print 'title =',filter_tags(title[0].strip())
fp.write(filter_tags(title[0].strip()))
fp.write('\r\n')
else:
fp.write('&')
fp.write('\r\n')
if filter_tags(des[0].strip()):
print 'des =',filter_tags(des[0].strip())
fp.write(filter_tags(des[0].strip()))
fp.write('\r\n')
else:
fp.write('&')
fp.write('\r\n')
a_num=0
for ansl in ans:
if filter_tags(ansl.strip()):
a_num+=1
an_two=filter_tags(ansl.strip())
print 'ans =',an_two
fp.write(an_two)
fp.write('\r\n')
else:
fp.write('&')
fp.write('\r\n')
print "#####"
# Scrapy spider that crawls a travel Q&A site, recursively following next-page links.
# (Source article last published 2020-11-24 08:58:55.)