import scrapy
class MaoyanSpider(scrapy.Spider):
    """Spider that collects movie names and scores from a Maoyan film listing page."""

    name = 'maoyan'
    allowed_domains = ['maoyan.com']
    start_urls = ['https://maoyan.com/films?showType=3']

    def parse(self, response):
        """Yield one ``{"name": ..., "score": ...}`` dict per movie on the page.

        Names and scores are extracted by two independent XPath queries and
        paired positionally with ``zip``, so the output stops at the shorter
        of the two lists.
        """
        title_xpath = "//div[@class='channel-detail movie-item-title']/a/text()"
        score_xpath = "//div[@class='channel-detail channel-detail-orange']"

        titles = response.xpath(title_xpath).extract()
        # Do not call extract() here: the selectors are needed so that a
        # second XPath can be evaluated relative to each matched node.
        score_nodes = response.xpath(score_xpath)
        # string(.) evaluates to the full text content of the current node.
        score_texts = [
            node.xpath('string(.)').extract_first() for node in score_nodes
        ]

        for title, score_text in zip(titles, score_texts):
            # Each result is emitted as a plain dict.
            yield {"name": title, "score": score_text}
yield只能推送字典或者item对象。现在我们推送的是字典,那我们怎么推送item呢?
在这里插入代码片
首先去items.py[源文件]
import scrapy
class MyspiderItem(scrapy.Item):
    """Empty item container; no fields are declared yet."""
    # Fields are declared as class attributes, for example:
    # name = scrapy.Field()
我们改成我们要的items
import scrapy
class MovieItem(scrapy.Item):
    """Item holding the two values stored for a movie: its name and its score."""

    # Movie title.
    name = scrapy.Field()
    # Movie score.
    score = scrapy.Field()
我们在我们的yield中调用这个MovieItem
注意:yield item 这一行也是缩进的(它位于 for 循环体内)
import scrapy
from MySpider.items import MovieItem
class MaoyanSpider(scrapy.Spider):
    """Spider that collects movie names and scores from a Maoyan film listing page."""

    name = 'maoyan'
    allowed_domains = ['maoyan.com']
    start_urls = ['https://maoyan.com/films?showType=3']

    def parse(self, response):
        """Yield one ``MovieItem`` per movie found on the page.

        Names and scores are extracted by two independent XPath queries and
        paired positionally with ``zip``, so the output stops at the shorter
        of the two lists.
        """
        names = response.xpath(
            "//div[@class='channel-detail movie-item-title']/a/text()"
        ).extract()
        # Do not call extract() here: the selectors are needed so that a
        # second XPath can be evaluated relative to each matched node.
        scores_div = response.xpath(
            "//div[@class='channel-detail channel-detail-orange']"
        )
        # string(.) evaluates to the full text content of the current node.
        scores = [div.xpath('string(.)').extract_first() for div in scores_div]

        for name, score in zip(names, scores):
            # Create a fresh item for every movie. Re-using one instance and
            # mutating it would make every yielded item the same object, so
            # any consumer that keeps a reference would see only the values
            # of the last movie.
            item = MovieItem()
            item['name'] = name
            item['score'] = score
            yield item
我们推送后要怎么写入文件呢?
进入pipelines.py文件中:
import json
class MyspiderPipeline(object):
    """Item pipeline that appends every item to ``movie.txt`` as one JSON line."""

    def process_item(self, item, spider):
        """Serialize ``item`` to JSON, append it to ``movie.txt`` and return it.

        Args:
            item: A mapping-like item (a plain dict or an Item object).
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, unchanged.
        """
        # NOTE: the file is opened and closed once per item.
        with open('movie.txt', "a", encoding='utf-8') as f:
            # dict(item) is required because json can only serialize real
            # dicts, not other mapping types such as Item objects. The
            # trailing newline keeps one record per line so the file can be
            # parsed back.
            f.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item
这样每处理一个item就要打开一次文件,频繁地开关文件流并不好。
于是scrapy给我们提供了它的方案:
import json
class MyspiderPipeline(object):
    """Item pipeline that writes every item to ``movie.txt`` as one JSON line.

    The output file is opened once in ``open_spider`` and closed once in
    ``close_spider`` instead of being reopened for every item.
    """

    def open_spider(self, spider):
        """Open ``movie.txt`` in append mode; called when the spider starts."""
        # Plain assignment: a ``with ... as f`` block would close the file
        # before process_item could use it.
        self.filename = open('movie.txt', "a", encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize ``item`` to JSON, append it to the open file and return it.

        Args:
            item: A mapping-like item (a plain dict or an Item object).
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, unchanged.
        """
        # dict(item) is required because json can only serialize real dicts,
        # not other mapping types such as Item objects. The trailing newline
        # keeps one record per line so the file can be parsed back.
        self.filename.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item

    def close_spider(self, spider):
        """Close the output file; called when the spider finishes."""
        # close must be called; a bare attribute reference does nothing.
        self.filename.close()