利用Scrapy爬取豆瓣电影信息,这里主要列出Scrapy工程的三部分代码:
spider.py文件:
# _*_ coding=utf-8 _*_
import scrapy
from course.douban_items import DouBanItem
from scrapy.http import Request
class DouBanSpider(scrapy.Spider):
    """Search douban.com for movie titles read from a local file and
    scrape each movie's detail page into a DouBanItem."""

    name = "DouBanSpider"
    allowed_domains = ["movie.douban.com"]
    # Filled in by start_requests(); kept for parity with the usual
    # scrapy.Spider attribute.
    start_urls = []

    # Newline-separated list of movie titles to search for.
    # TODO: make this configurable (spider argument / settings) instead of
    # a hard-coded absolute path.
    MOVIE_NAME_FILE = '/Users/lucas/PycharmProjects/scrapy_learn/course/course/movie_name'

    def start_requests(self):
        """Yield one search request per non-blank movie title in the file."""
        url_head = "http://movie.douban.com/subject_search?search_text="
        # 'with' guarantees the file is closed even if request creation fails.
        with open(self.MOVIE_NAME_FILE, 'r') as file_object:
            for line in file_object:
                # strip() removes the trailing newline that would otherwise
                # be appended to the search URL and break it.
                title = line.strip()
                if not title:
                    continue
                url = url_head + title
                self.start_urls.append(url)
                # Request(...) replaces the deprecated
                # make_requests_from_url(); dont_filter=True matches the
                # old helper's default behavior.
                yield Request(url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        """Follow the first search result to the movie's detail page."""
        url = response.xpath('//*[@id="content"]/div/div[1]/div[2]/table[1]/tr/td[1]/a/@href').extract()
        if url:
            yield Request(url[0], callback=self.parse_item)

    def parse_item(self, response):
        """Extract movie metadata from a detail page into a DouBanItem."""
        item = DouBanItem()
        item['movie_name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
        item['movie_director'] = response.xpath('//*[@id="info"]/span[1]/span[2]/a/text()').extract()
        item['movie_writer'] = response.xpath('//*[@id="info"]/span[2]/span[2]/a/text()').extract()
        item['movie_roles'] = response.xpath('//*[@id="info"]/span[3]/span[2]/a/text()').extract()
        # NOTE(review): positional index 10 assumes a fixed layout of the
        # #info block and will raise IndexError if the page changes —
        # verify against the live page.
        item['movie_language'] = response.xpath('//*[@id="info"]/text()').extract()[10]
        item['movie_date'] = response.xpath('//*[@id="info"]/span[11]/text()').extract()
        item['movie_long'] = response.xpath('//*[@id="info"]/span[13]/text()').extract()
        item['movie_description'] = response.xpath('//*[@id="link-report"]/span/text()').extract()
        item["movie_score"] = response.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()').extract()
        yield item
item.py文件:
# _*_ coding=utf-8 _*_
import scrapy
from scrapy import Item, Field
class DouBanItem(Item):
    """Container for the movie fields scraped by DouBanSpider."""

    movie_name = Field()         # movie title
    movie_director = Field()     # director name(s)
    movie_writer = Field()       # screenwriter name(s)
    movie_roles = Field()        # leading actors
    movie_language = Field()     # language line from the info block
    movie_date = Field()         # release date
    movie_long = Field()         # running time
    movie_description = Field()  # plot synopsis
    movie_score = Field()        # douban rating
pipeline.py文件:
# _*_ coding=utf-8 _*_
from scrapy.exceptions import DropItem
import json
class DouBanPipeline(object):
    """Item pipeline that appends each scraped item to douban.json,
    one JSON object per line (JSON Lines format)."""

    def __init__(self):
        # Output file for the whole crawl; closed in close_spider().
        self.file = open('douban.json', 'w')

    def process_item(self, item, spider):
        """Serialize *item* to one JSON line and pass it through unchanged.

        Returning the item keeps it flowing to any later pipelines.
        """
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

    def open_spider(self, spider):
        """Called when the spider opens; nothing to set up here."""
        pass

    def close_spider(self, spider):
        """Close the output file so buffered lines are flushed to disk.

        The original left this as ``pass``, leaking the file handle and
        risking loss of the final buffered writes.
        """
        self.file.close()
主要代码就是以上三部分,附上我的代码下载地址,需要的朋友可以下载看看,共同学习:
http://download.csdn.net/detail/lb245557472/9851006