用SCRAPY爬取豆瓣
items.py
import scrapy
class Douban1Item(scrapy.Item):
    """Container for one Douban movie record produced by the spider."""

    names = scrapy.Field()   # movie title(s)
    actors = scrapy.Field()  # cast / publication info line
    scores = scrapy.Field()  # rating value(s)
    webs = scrapy.Field()    # detail-page URL(s)
douban.py
import scrapy
from douban1.items import Douban1Item
class DoubanSpider(scrapy.Spider):
    """Spider for the Douban movie chart page.

    Crawls https://movie.douban.com/chart and yields one ``Douban1Item``
    per movie row.
    """

    name = 'douban'
    allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/chart']
    # Browser-like User-Agent; Douban rejects the default Scrapy UA.
    # NOTE(review): Scrapy does not read a class attribute named `header`
    # automatically — set USER_AGENT / DEFAULT_REQUEST_HEADERS in
    # settings.py for this to actually take effect.
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36'}

    def parse(self, response):
        """Parse the chart page and yield one item per movie row.

        Fixes vs. the original:
        * XPaths inside the loop now start with ``.//`` so they are
          relative to the current ``<tr class="item">``; a leading ``//``
          re-scans the whole document, giving every item identical
          whole-page data.
        * ``yield item`` inside the loop replaces ``return item`` after
          it, so all movies are emitted instead of only the last one.
        """
        rows = response.xpath('//div[@class="indent"]//tr[@class="item"]')
        for movie in rows:
            item = Douban1Item()
            item['names'] = movie.xpath(".//a[@class='nbg']/@title").extract()
            item['actors'] = movie.xpath(".//p[@class='pl']/text()").extract()
            item['scores'] = movie.xpath(".//span[@class='rating_nums']/text()").extract()
            item['webs'] = movie.xpath(".//a[@class='nbg']/@href").extract()
            yield item
pipelines.py
写入txt文件如下:
class Douban1Pipeline(object):
    """Pipeline that appends each scraped item to ``douban.txt``.

    Fixes vs. the original:
    * ``process_item`` now returns the item, as the Scrapy pipeline
      contract requires — otherwise any later pipeline receives ``None``.
    * The file is opened with an explicit UTF-8 encoding so Chinese
      titles are written correctly regardless of the platform default.
    """

    def process_item(self, item, spider):
        """Append the item's fields to douban.txt and pass the item on."""
        # 'a' is enough: the original 'a+' added read access never used.
        with open('douban.txt', 'a', encoding='utf-8') as fp:
            fp.write(str(item['names']) + '\n\n')
            fp.write(str(item['scores']) + '\n\n')
            fp.write(str(item['actors']) + '\n\n')
            fp.write(str(item['webs']) + '\n\n')
        return item
pipelines.py
写入json文件如下:
class Douban1Pipeline(object):
    """Pipeline that appends each scraped item to ``douban.json``,
    one JSON object per record, separated by tab runs.

    Fixes vs. the original:
    * ``json`` was used but never imported in this snippet; it is now
      imported where it is used (the snippet shows no module header to
      add a top-level import to).
    * The file is opened with an explicit UTF-8 encoding, which is
      required for the non-ASCII text ``ensure_ascii=False`` produces.

    NOTE(review): records are tab-separated (kept from the original), so
    the file is NOT one valid JSON document — it is a stream of objects.
    """

    def open_spider(self, spider):
        # One handle for the whole crawl; closed in close_spider().
        self.filename = open('douban.json', 'a', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize the item as JSON, write it, and pass the item on."""
        import json  # local import: missing from the original snippet
        content = json.dumps(dict(item), ensure_ascii=False) + '\t\t\t\t\t'
        self.filename.write(content)
        return item

    def close_spider(self, spider):
        self.filename.close()