Scrapy 爬虫爬取旅游问答网站，递归爬取下一页

最新推荐文章于 2020-11-24 08:58:55 发布

疯颠颠_

最新推荐文章于 2020-11-24 08:58:55 发布

阅读量2.7k

点赞数

分类专栏： python

本文链接：https://blog.csdn.net/huang_mao_mao/article/details/24313681

版权

python 专栏收录该内容

17 篇文章 0 订阅

订阅专栏

class BusSpider(BaseSpider):
    """Spider for the Sina travel Q&A listing pages.

    ``parse`` handles a listing page: it may schedule one more listing page
    (handled by ``parse`` again) and schedules one request per extracted
    question link.  ``parse_cat`` handles a question page and dumps its
    title, description and answers to a ``record<N>.txt`` file.
    """

    # Spider name
    name = "xinlang"
    # Initial URL list
    start_urls = ["http://travel.sina.com.cn/list-wenda/all/5"]

    def parse(self, response):
        """Yield the follow-up requests for one listing page.

        Yields at most one ``Request`` back to ``parse`` (another listing
        page), followed by one ``Request`` per extracted link with
        ``parse_cat`` as the callback.
        """
        hxs = HtmlXPathSelector(response)

        # NOTE(review): a[5] presumably holds a page number in the pager and
        # a[9] the "next page" link -- confirm against the live markup.
        how = hxs.x('/html/body/div[5]/div[1]/div[2]/div[1]/a[5]/text()').extract()
        # An empty match previously raised IndexError here, which aborted the
        # page before any question link was yielded; now it only skips the
        # extra listing request.
        if how and int(how[0].strip()) != 5:
            nn = hxs.x('/html/body/div[5]/div[1]/div[2]/div[1]/a[9]/@href').extract()
            if nn:
                yield Request(url="http://travel.sina.com.cn" + nn[0].strip(),
                              callback=self.parse)

        cat_urls = hxs.x('/html/body/div[5]/div[1]/div[1]/ul/li/a/@href').extract()
        print('cat_urls = %s' % (cat_urls,))
        for url in cat_urls:
            # Build the new URL from the site root and the extracted href
            new_url = "http://travel.sina.com.cn" + url
            print("[parse]new_url = %s" % (new_url))
            # Request the page and let parse_cat process the response
            yield Request(new_url, callback=self.parse_cat)

    def _write_field(self, fp, label, text):
        """Write one record line to ``fp``: ``text``, or '&' when it is empty.

        A non-empty ``text`` is also echoed to stdout as ``<label> = <text>``.
        Every line is terminated with CRLF.
        """
        if text:
            print('%s = %s' % (label, text))
            fp.write(text)
        else:
            # '&' marks a field whose cleaned text is empty
            fp.write('&')
        fp.write('\r\n')

    def parse_cat(self, response):
        """Write the title, description and answers of one page to a file.

        Increments the module-level ``SUM`` counter and writes to
        ``record<SUM>.txt`` (UTF-8): one line for the title, one for the
        description, then one line per extracted answer paragraph.  Each
        text is passed through ``filter_tags`` (defined elsewhere in the
        file) after stripping; a missing or empty value is written as '&'.
        """
        global SUM

        hxs = HtmlXPathSelector(response)
        title = hxs.x('//h1[@id="artibodyTitle"]/text()').extract()
        des = hxs.x('/html/body/div[5]/div[1]/div[1]/p/text()').extract()
        ans = hxs.x('/html/body/div[5]/div[1]/div[2]/ul/li/p/text()').extract()

        # Write the results to the record file
        print("#####")
        SUM += 1
        # The context manager closes the file even if a write fails; the
        # handle was previously never closed.
        with codecs.open('record' + str(SUM) + '.txt', 'w', 'utf-8') as fp:
            for label, matches in (('title', title), ('des', des)):
                # An empty XPath result previously raised IndexError; treat
                # it like an empty field so the record keeps its line layout.
                text = filter_tags(matches[0].strip()) if matches else None
                self._write_field(fp, label, text)
            for ansl in ans:
                self._write_field(fp, 'ans', filter_tags(ansl.strip()))
        print("#####")

疯颠颠_

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Scrapy 爬虫爬取旅游问答网站，递归爬取下一页

class BusSpider(BaseSpider): #设置爬虫名称 name = "xinlang" #设置起始URL列表 start_urls = ["http://travel.sina.com.cn/list-wenda/all/5"] def parse(self, response): req = [] hx
复制链接

扫一扫