#网址:https://movie.douban.com/top250
#列表页分析:
第一页:https://movie.douban.com/top250?start=0&filter=
第二页:https://movie.douban.com/top250?start=25&filter=
第三页:https://movie.douban.com/top250?start=50&filter=
最后一页:https://movie.douban.com/top250?start=225&filter=
请求方式:GET
翻页参数:start(每页递增 25,取值范围 0~225)
返回数据格式:HTML
1.1 创建项目
scrapy startproject ssqSpider
cd ssqSpider
scrapy genspider douban douban.com
1.2 创建爬虫
scrapy genspider douban "douban.com"
1.3 添加数据模型(Item,位于 items.py)
import scrapy
class DoubanMovieItem(scrapy.Item):
    """Container for the three values scraped for a single movie entry."""

    # Movie title.
    title = scrapy.Field()
    # Rating score.
    stars = scrapy.Field()
    # "Subject" of the movie, as labelled in the original notes.
    subject = scrapy.Field()
1.4 修改爬虫代码,以列表页的一页数据为例子
1.5.1 数据为html文档时使用Selector或者response.css
import scrapy
from scrapy.http import HtmlResponse
from scrapy import Request,Selector
from ssqSpider.items import DoubanMovieItem
class DoubanSpider(scrapy.Spider):
    """Spider that extracts movie entries from one list page using CSS selectors."""

    name = "douban"
    allowed_domains = ["douban.com"]
    start_urls = ["https://movie.douban.com/top250?start=0&filter="]

    def parse(self, response: HtmlResponse):
        """Yield one ``DoubanMovieItem`` for every ``<li>`` of the list.

        Each field holds the first matching text node inside that ``<li>``,
        or ``None`` when the selector matches nothing.
        """
        # The original notes also show ``Selector(response).css(...)`` as an
        # alternative to the response's own ``css`` shortcut used here.
        for movie_node in response.css('#content > div > div.article > ol > li'):
            yield DoubanMovieItem(
                title=movie_node.css('span.title::text').extract_first(),
                stars=movie_node.css('span.rating_num::text').extract_first(),
                subject=movie_node.css('span.inq::text').extract_first(),
            )
1.5.2 数据为html文档时使用response.xpath
import scrapy
from scrapy.http import HtmlResponse
from scrapy import Request,Selector
from ssqSpider.items import DoubanMovieItem
class DoubanSpider(scrapy.Spider):
    """Spider that extracts movie entries from one list page using XPath."""

    name = "douban"
    allowed_domains = ["douban.com"]
    start_urls = ["https://movie.douban.com/top250?start=0&filter="]

    def parse(self, response: HtmlResponse):
        """Yield one ``DoubanMovieItem`` for every ``<li>`` of the list.

        Each field holds the first matching text node inside that ``<li>``,
        or ``None`` when the expression matches nothing.
        """
        list_items = response.xpath('//*[@id="content"]/div/div[1]/ol/li')
        for list_item in list_items:
            item = DoubanMovieItem()
            # The leading "." keeps each query relative to the current <li>.
            # A bare "//span[...]" is evaluated against the whole document even
            # when called on a nested selector, which would make every item
            # repeat the values of the first movie on the page.
            item['title'] = list_item.xpath('.//span[@class="title"]/text()').extract_first()
            item['stars'] = list_item.xpath('.//span[@class="rating_num"]/text()').extract_first()
            item['subject'] = list_item.xpath('.//span[@class="inq"]/text()').extract_first()
            yield item