scrapy项目
创建新项目
scrapy startproject douban #创建一个新的项目
创建爬虫
scrapy genspider Top250 douban.com #创建爬虫
编辑items.py添加属性
打开items.py,在DoubanItem中添加需要抓取的属性(字段)
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class DoubanItem(scrapy.Item):
    """Container for one movie entry scraped from the Douban Top 250 list.

    The fields are filled in by ``Top250Spider.parse``.
    """

    # Text of the entry's first <span class="title">.
    title = scrapy.Field()
    # ``href`` attribute of the link inside the entry's <div class="hd">.
    href = scrapy.Field()
    # Text of the 4th <span> inside <div class="star">
    # (presumably the number of votes -- confirm against the page markup).
    point = scrapy.Field()
    # Text of the 2nd <span> inside <div class="star">
    # (presumably the numeric score -- confirm against the page markup).
    rating = scrapy.Field()
编写爬虫(编辑spiders目录下由genspider生成的Top250.py)
# -*- coding: utf-8 -*-
import scrapy
from douban.items import DoubanItem
class Top250Spider(scrapy.Spider):
    """Spider for the first page of the Douban movie Top 250 list.

    Every ``<li>`` under ``<ol class="grid_view">`` is turned into one
    ``DoubanItem``.
    """

    name = 'Top250'
    allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/top250?start=0']

    def parse(self, response):
        """Extract one ``DoubanItem`` per movie entry found in *response*.

        Args:
            response: The downloaded listing page.

        Yields:
            DoubanItem: An item with ``href``, ``title``, ``point`` and
            ``rating`` set. A field is ``None`` when its node is missing
            from the entry.
        """
        # Keep a raw copy of the page on disk so the XPath expressions can
        # be checked offline against what the server actually returned.
        with open('info.html', 'wb') as file:
            file.write(response.body)

        detail = response.xpath('//ol[@class="grid_view"]/li')
        self.logger.debug('Matched %d movie entries', len(detail))

        for each in detail:
            item = DoubanItem()
            # The leading "." makes each query relative to the current <li>.
            # A bare "//" would search the whole document, so every item
            # would receive the values of all entries on the page.
            # .get() returns the first match as a string (or None) instead
            # of a list, so each field holds a single value.
            item['href'] = each.xpath('.//div[@class="hd"]/a/@href').get()
            item['title'] = each.xpath(
                './/span[@class="title"][1]/text()').get()
            item['point'] = each.xpath(
                './/div[@class="star"]/span[4]/text()').get()
            item['rating'] = each.xpath(
                './/div[@class="star"]/span[2]/text()').get()
            yield item
运行并生成csv文件
scrapy crawl Top250 -o info.csv
运行之后出现一个info.csv文件