# NOTE: the item pipeline must be enabled in settings.py (ITEM_PIPELINES),
# otherwise process_item is never called.
# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup as bs
import re
from douban.items import DoubanItem #这里是要引入items字段
# Run with: scrapy crawl dou
class DouSpider(scrapy.Spider):
    """Spider that scrapes review titles and bodies from one Douban movie page.

    Yields a ``DoubanItem`` with:
        name: the review header line (newlines removed)
        con:  the review body text (newlines and all spaces removed)

    NOTE(review): only the single start page is scraped; there is no
    pagination follow-up.
    """

    name = 'dou'  # spider name used by `scrapy crawl dou`
    start_urls = ['https://movie.douban.com/subject/30314127/reviews']

    def parse(self, response):
        """Parse the reviews listing page and yield one item per review."""
        soup = bs(response.text, 'lxml')
        # Each review sits in a <div class="main review-item"> container.
        # find_all is the modern bs4 name for the deprecated findAll alias.
        for review in soup.find_all('div', class_='main review-item'):
            item = DoubanItem()
            # First <header> holds the title line, first <div> the body;
            # plain str.replace is equivalent to the original literal re.sub.
            title = review.header.text.replace('\n', '')
            body = review.div.text.replace('\n', '').replace(' ', '')
            item['name'] = title
            item['con'] = body
            # Yield (not return) so scraping continues item by item.
            yield item
import scrapy #这个就是定义items 字段了
class DoubanItem(scrapy.Item):
    """Container for one scraped review.

    Fields:
        name: review title/header line.
        con:  review body text.
    """

    name = scrapy.Field()  # review title
    con = scrapy.Field()   # review content
# Pipeline: persists the items yielded by the spider.
class DoubanPipeline(object):
    """Pipeline that appends every item to ``douban.txt`` as plain text.

    Record format: title line, body line, then two blank lines as a
    separator. Must be enabled via ITEM_PIPELINES in settings.py.
    """

    def process_item(self, item, spider):
        """Append ``item`` to douban.txt and pass it on.

        Batches the record into a single write() call — the output bytes
        are identical to the original six separate writes, with fewer
        syscalls. Returns the item so later pipelines keep processing it.
        """
        with open("douban.txt", "a", encoding='utf-8') as f:
            f.write(f"{item['name']}\n{item['con']}\n\n\n")
        return item