方法1.
from scrapy.selector import HtmlXPathSelector


def parse(self, response):
    """Parse a response: follow every link and scrape DmozItems from it.

    Returns a list containing both follow-up Requests (re-scheduled with
    this same callback) and the DmozItem objects extracted from the page.
    """
    hxs = HtmlXPathSelector(response)
    items = []

    # Collect a follow-up request for every (valid) link on the page.
    new_urls = hxs.select('//a/@href').extract()
    valid_urls = []
    for url in new_urls:
        # TODO: replace this placeholder with a real URL-validity check.
        # The original wrote `if true:` (lowercase), which is a NameError
        # at runtime in Python.
        if True:
            valid_urls.append(url)
    items.extend(
        self.make_requests_from_url(url).replace(callback=self.parse)
        for url in valid_urls
    )

    # Scrape one item per <li>.  NOTE: the original re-bound `items = []`
    # here, silently throwing away the requests collected above; keep
    # appending to the same list so both requests and items are returned.
    sites = hxs.select('//ul/li')
    for site in sites:
        item = DmozItem()
        item['title'] = site.select('a/text()').extract()
        item['link'] = site.select('a/@href').extract()
        item['desc'] = site.select('text()').extract()
        items.append(item)

    return items
方法2.
import exceptions
import pickle
import urllib

import simplejson
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector

from sitemap.items import SitemapItem
class SitemapSpider(CrawlSpider):
    """Crawl qunar.com /routes/ pages, yielding one SitemapItem per page.

    Link following is done manually inside parse() rather than through
    `rules`, so the rules tuple is intentionally empty.
    """

    name = 'sitemap_spider'
    allowed_domains = ['qunar.com']
    start_urls = ['http://www.qunar.com/routes/']

    rules = (
        # Link extraction is handled manually in parse(); these stay disabled.
        #Rule(SgmlLinkExtractor(allow=(r'http://www.qunar.com/routes/.*')), callback='parse'),
        #Rule(SgmlLinkExtractor(allow=('http:.*/routes/.*')), callback='parse'),
    )

    def parse(self, response):
        """Yield Requests for in-site /routes/ links, then one SitemapItem
        holding this page's URL, meta keywords and meta description."""
        item = SitemapItem()
        x = HtmlXPathSelector(response)

        # Keep only links belonging to the /routes/ section, turning
        # relative ones into absolute qunar.com URLs.
        raw_urls = x.select("//a/@href").extract()
        urls = []
        for url in raw_urls:
            if 'routes' in url:
                if 'http' not in url:
                    url = 'http://www.qunar.com' + url
                urls.append(url)

        for url in urls:
            yield Request(url)

        item['url'] = response.url.encode('UTF-8')
        # Guard against pages lacking these meta tags: the original indexed
        # [0] unconditionally and raised IndexError on such pages.
        arr_keywords = x.select("//meta[@name='keywords']/@content").extract()
        item['keywords'] = arr_keywords[0].encode('UTF-8') if arr_keywords else ''
        arr_description = x.select("//meta[@name='description']/@content").extract()
        item['description'] = arr_description[0].encode('UTF-8') if arr_description else ''

        yield item
关于rule.
rules = (
    # URLs matching this pattern are followed for the links they contain,
    # but no content is scraped from them.  (The URLs here are placeholders
    # — substitute real ones before use.)  NOTE: '?' is a regex
    # metacharacter, so the literal query-string '?' must be escaped as
    # r'\?'; the original unescaped pattern could never match a real URL.
    Rule(SgmlLinkExtractor(allow=(r'http://test_url/test\?page_index=\d+',))),
    # URLs matching this pattern are both followed and scraped via the
    # parse_item callback.  (Also fixes the original's 'test_rul' typo.)
    Rule(SgmlLinkExtractor(allow=(r'http://test_url/test\?product_id=\d+',)), callback="parse_item"),
)