核心spider:(入门简单参考,进阶的以后会更新,不要照抄,xpath的脚本自己去核对一下,不一定还能用)
# -*- coding: utf-8 -*-
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider,Rule
from scrapy.linkextractors.sgml import SgmlLinkExtractor
from doubanmovie.items import DoubanmoiveItem
class MoiveSpider(CrawlSpider):
name="doubanmovie"
allowed_domains=["movie.douban.com"]
start_urls=["http://movie.douban.com/top250"]
rules=[
Rule(SgmlLinkExtractor(allow=(r'http://movie.douban.com/top250\?start=\d+.*'))),
Rule(SgmlLinkExtractor(allow=(r'http://movie.douban.com/subject/\d+')),callback="parse_item"),
]
def parse_item(self,response):
sel=Selector(response)
item=DoubanmoiveItem()
item['name']=sel.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
item['year']=sel.xpath('//*[@id="content"]/h1/span[2]/text()').re(r'\((\d+)\)')
return item