爬取IMDb TOP250电影基本信息
主要代码
items.py:
import scrapy
class ImdbItem(scrapy.Item):
    """Container for the data scraped for one IMDb Top 250 movie.

    Every attribute is declared as a plain ``scrapy.Field()`` with no
    per-field metadata. The notes on individual fields below are inferred
    from the field names only; confirm them against the spider that
    populates the item.
    """

    # Chart position and identification.
    rank = scrapy.Field()
    movie_name = scrapy.Field()
    movie_type = scrapy.Field()  # presumably the genre(s) -- TODO confirm

    # Credits.
    director = scrapy.Field()
    writer = scrapy.Field()
    stars = scrapy.Field()  # presumably the lead cast -- TODO confirm

    # Ratings and general facts.
    score = scrapy.Field()  # presumably the IMDb user rating -- TODO confirm
    country = scrapy.Field()
    metascore = scrapy.Field()
    movie_length = scrapy.Field()  # presumably the runtime; unit not shown here
    year = scrapy.Field()

    # Review counts.
    comment_num = scrapy.Field()  # presumably user reviews -- TODO confirm
    critic_num = scrapy.Field()  # presumably critic reviews -- TODO confirm

    # NOTE(review): the meaning of "CWG" is not evident from this file;
    # check the spider that assigns it.
    CWG = scrapy.Field()

    # Budget fields are currently disabled (not declared on the item).
    # budget = scrapy.Field()
    # budget_type = scrapy.Field()
spiders:
# -*- coding: utf-8 -*-
import scrapy
from imdb.items import ImdbItem
import re
import time
import copy
# scrapy crawl rank -o rank.csv
class RankSpider(scrapy.Spider):
name = 'rank'
allowed_domains = ['imdb.com']
start_urls = ['https://www.imdb.com/chart/top/?ref_=nv_mv_250']
# request top250 page, get movie url
def parse(self, response):
item = ImdbItem()
rank_list = response.xpath('//td[@class="titleColumn"]/text()').re('\d+')
movie_index = 0
for i in rank_list:
detail_url = response.xpath(