Target URL: https://movie.douban.com/top250
To save some setup work, we create the spider file directly inside the project created earlier, with the command "scrapy genspider douban douban.com", which generates douban.py.
If you don't know how to create a Scrapy project, you can refer to the earlier article on the Scrapy crawler framework.
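For reference, the full setup looks roughly like this (assuming the project is named baiduSpider, which is what the import paths in the spider below imply):

scrapy startproject baiduSpider
cd baiduSpider
scrapy genspider douban douban.com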
The spider code is as follows:
import scrapy
from baiduSpider.items import BaiduspiderItem
import re


class DoubanSpider(scrapy.Spider):
    name = 'douban'
    allowed_domains = ['douban.com']
    start_urls = ['https://movie.douban.com/top250']
    # start_urls = ['https://movie.douban.com/top250?start=' + str(i) for i in range(0, 100, 25)]  # alternative: enumerate the paged URLs up front (this range only covers the first 100 entries)

    def parse(self, response):  # parse each crawled page
        for row in response.xpath('//*[@id="content"]/div/div/ol/li/div'):  # iterate over every movie entry in the list
            item = BaiduspiderItem()  # instantiate the item
            item['rank'] = row.xpath("div[1]/em/text()").get()  # rank
            item['name'] = row.xpath("div[2]/div[1]/a/span[1]/text()").get()  # movie title
            director = row.xpath("div[2]/div[2]/p[1]/text()[1]").get().strip()  # raw director/cast line
            item['director'] = re.findall(r"导演: (.*?) 主演:", director)  # regex: keep only the director's name ("导演" = director, "主演" = starring)
            item['score'] = row.xpath("div[2]/div[2]/div/span[2]/text()").get()  # rating
            item['url'] = row.xpath("div[1]/a/img/@src").get()  # poster image URL
            yield item
        next_url = response.xpath('//*[@id="content"]/div/div[1]/div[2]/span[3]/a')  # the "next page" link
        if next_url:
            next_url = "https://movie.douban.com/top250" + next_url.xpath("@href").get()  # the href is relative, e.g. "?start=25&filter="
            yield scrapy.Request(next_url, callback=self.parse)  # recurse until there is no next page
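The list page mixes the director and the lead actor into a single text node, which is why the regex above is needed. A quick sketch of what it does (the sample string is hypothetical):

import re

# hypothetical sample of the raw text node on the list page
line = "导演: 弗兰克·德拉邦特 主演: 蒂姆·罗宾斯"
print(re.findall(r"导演: (.*?) 主演:", line))
# -> ['弗兰克·德拉邦特']

Note that re.findall returns a list, so the director column in the exported CSV will show bracketed values; use re.search and .group(1) if you want a plain string instead.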
The items.py code is as follows:
import scrapy


class BaiduspiderItem(scrapy.Item):
    rank = scrapy.Field()
    name = scrapy.Field()
    director = scrapy.Field()
    score = scrapy.Field()
    url = scrapy.Field()
Finally, run the spider with the command: scrapy crawl douban -o douban.csv (note that -o appends to an existing file; on Scrapy 2.0+ you can use -O to overwrite instead).
The final results look like this:
While we're at it, let's also download the movie posters.
Spider file:
import scrapy
from baiduSpider.items import BaiduspiderItem


class DoubanSpider(scrapy.Spider):
    name = 'douban2'
    allowed_domains = ['douban.com']
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        for row in response.xpath('//*[@id="content"]/div/div[1]/ol/li/div'):
            item = BaiduspiderItem()  # instantiate the item
            item['name'] = row.xpath("div[2]/div/a/span[1]/text()").extract()[0]  # movie title
            item['img'] = row.xpath("div[1]/a/img/@src").getall()  # poster URLs; getall() returns the list ImagesPipeline expects
            item['url'] = row.xpath("div/a/@href").extract()[0]  # link to the movie's detail page
            # hand the half-filled item to the detail-page callback via meta
            yield scrapy.Request(item['url'], meta={'item': item}, callback=self.parse_detail)
        # follow the "next" link so all 250 entries get crawled
        next_url = response.xpath("//span[@class='next']/a")
        if next_url:
            next_url = "https://movie.douban.com/top250" + next_url.xpath("@href").get()
            yield scrapy.Request(next_url, callback=self.parse)

    def parse_detail(self, response):
        item = response.meta['item']
        item['bj'] = response.xpath("//div[@id='info']/span[2]/span[2]/a/text()").get()  # screenwriter ("bj" = 编剧; span[2] in div#info is the screenwriter block)
        yield item
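This spider assigns to name, img, url, and bj, and a scrapy.Item raises a KeyError for any field that isn't declared, so items.py must declare those fields as well. The items.py for this version isn't shown above; one way to do it (a minimal sketch, with field names taken from the two spiders) is a single class covering both:

import scrapy


class BaiduspiderItem(scrapy.Item):
    # fields used by the first spider
    rank = scrapy.Field()
    director = scrapy.Field()
    score = scrapy.Field()
    # fields shared with / added for the poster spider
    name = scrapy.Field()
    url = scrapy.Field()
    img = scrapy.Field()  # list of poster URLs consumed by ImagesPipeline
    bj = scrapy.Field()   # screenwriter, filled in parse_detail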
settings.py
ITEM_PIPELINES = {
    # multiple pipelines can be configured, comma-separated; the number is the priority, and smaller values mean higher priority
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = 'd:\\doubanimg'  # directory where downloaded images are saved
IMAGES_URLS_FIELD = 'img'  # the item field that holds the image URLs
FEED_EXPORT_FIELDS = ["name", "bj", "img", "url"]  # column order of the exported feed
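Two practical notes: ImagesPipeline requires the Pillow package (pip install Pillow), and douban.com typically rejects Scrapy's default User-Agent, so setting a browser USER_AGENT (and usually ROBOTSTXT_OBEY = False) in settings.py tends to be needed as well. By default the pipeline names each file after the SHA1 hash of its URL; if you would rather name posters after the movie, a minimal sketch of a subclass follows (DoubanImagesPipeline and the .jpg naming are my own choices, not from the original article):

from scrapy.pipelines.images import ImagesPipeline
import scrapy


class DoubanImagesPipeline(ImagesPipeline):
    # request each poster URL stored on the item, carrying the title along in meta
    def get_media_requests(self, item, info):
        for url in item['img']:
            yield scrapy.Request(url, meta={'name': item['name']})

    # save as <movie title>.jpg instead of the default SHA1-hash filename
    def file_path(self, request, response=None, info=None, *, item=None):
        return request.meta['name'] + '.jpg'

To use it, register the subclass in ITEM_PIPELINES (e.g. 'baiduSpider.pipelines.DoubanImagesPipeline': 1) in place of the built-in pipeline.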
Results: