# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class WxSpider(CrawlSpider):
    """Crawl WeChat mini-program articles from wxapp-union.com.

    Follows the paginated list pages under ``portal.php?mod=list&catid=1``
    and hands each article detail page (``article-<id>-1.html``) to
    :meth:`parse_item`.
    """

    name = 'wx'
    allowed_domains = ['wxapp-union.com']
    start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=1&page=1']

    rules = (
        # List (pagination) pages: follow to discover article links, no callback.
        # Dots escaped so '.' only matches a literal dot, not any character.
        Rule(
            LinkExtractor(allow=r'http://www\.wxapp-union\.com/portal\.php\?mod=list&catid=1&page=\d+'),
            follow=True,
        ),
        # Article detail pages: extract the item via parse_item.
        Rule(
            LinkExtractor(allow=r'http://www\.wxapp-union\.com/article-\d+-1\.html'),
            callback='parse_item',
        ),
    )

    def parse_item(self, response):
        """Extract one article from a detail page.

        Field extraction is still a stub (template placeholders below);
        fill in real XPaths for the fields you need.

        :param response: the article detail page response.
        :returns: a plain dict item (empty until fields are filled in).
        """
        item = {}
        # TODO: populate real fields, e.g.:
        # item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
        # item['title'] = response.xpath('//h1/text()').get()
        # Return the item so the engine actually collects it — the original
        # template fell off the end of the function and discarded it.
        return item