方法1.
from scrapy.selector import HtmlXPathSelector


def parse(self, response):
    """Parse a response: follow every link and scrape DmozItems from it.

    Returns a list containing both follow-up Requests (re-scheduled with
    this same callback) and the DmozItem objects extracted from the page.
    """
    hxs = HtmlXPathSelector(response)
    items = []

    # Collect a follow-up request for every (valid) link on the page.
    new_urls = hxs.select('//a/@href').extract()
    valid_urls = []
    for url in new_urls:
        # TODO: replace this placeholder with a real URL-validity check.
        # The original wrote `if true:` (lowercase), which is a NameError
        # at runtime in Python.
        if True:
            valid_urls.append(url)
    items.extend(
        self.make_requests_from_url(url).replace(callback=self.parse)
        for url in valid_urls
    )

    # Scrape one item per <li>.  NOTE: the original re-bound `items = []`
    # here, silently throwing away the requests collected above; keep
    # appending to the same list so both requests and items are returned.
    sites = hxs.select('//ul/li')
    for site in sites:
        item = DmozItem()
        item['title'] = site.select('a/text()').extract()
        item['link'] = site.select('a/@href').extract()
        item['desc'] = site.select('text()').extract()
        items.append(item)

    return items
方法2.
import exceptions
import pickle
import urllib

import simplejson
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector

from sitemap.items import SitemapItem
class SitemapSpider(CrawlSpider):
    """Crawl qunar.com /routes/ pages, yielding one SitemapItem per page.

    Link following is done manually inside parse() rather than through
    `rules`, so the rules tuple is intentionally empty.
    """

    name = 'sitemap_spider'
    allowed_domains = ['qunar.com']
    start_urls = ['http://www.qunar.com/routes/']

    rules = (
        # Link extraction is handled manually in parse(); these stay disabled.
        #Rule(SgmlLinkExtractor(allow=(r'http://www.qunar.com/routes/.*')), callback='parse'),
        #Rule(SgmlLinkExtractor(allow=('http:.*/routes/.*')), callback='parse'),
    )

    def parse(self, response):
        """Yield Requests for in-site /routes/ links, then one SitemapItem
        holding this page's URL, meta keywords and meta description."""
        item = SitemapItem()
        x = HtmlXPathSelector(response)

        # Keep only links belonging to the /routes/ section, turning
        # relative ones into absolute qunar.com URLs.
        raw_urls = x.select("//a/@href").extract()
        urls = []
        for url in raw_urls:
            if 'routes' in url:
                if 'http' not in url:
                    url = 'http://www.qunar.com' + url
                urls.append(url)

        for url in urls:
            yield Request(url)

        item['url'] = response.url.encode('UTF-8')
        # Guard against pages lacking these meta tags: the original indexed
        # [0] unconditionally and raised IndexError on such pages.
        arr_keywords = x.select("//meta[@name='keywords']/@content").extract()
        item['keywords'] = arr_keywords[0].encode('UTF-8') if arr_keywords else ''
        arr_description = x.select("//meta[@name='description']/@content").extract()
        item['description'] = arr_description[0].encode('UTF-8') if arr_description else ''

        yield item
关于rule.
rules = (
    # URLs matching this pattern are followed for the links they contain,
    # but no content is scraped from them.  (The URLs here are placeholders
    # — substitute real ones before use.)  NOTE: '?' is a regex
    # metacharacter, so the literal query-string '?' must be escaped as
    # r'\?'; the original unescaped pattern could never match a real URL.
    Rule(SgmlLinkExtractor(allow=(r'http://test_url/test\?page_index=\d+',))),
    # URLs matching this pattern are both followed and scraped via the
    # parse_item callback.  (Also fixes the original's 'test_rul' typo.)
    Rule(SgmlLinkExtractor(allow=(r'http://test_url/test\?product_id=\d+',)), callback="parse_item"),
)