在该部分将 pipeline 模块加上
直接上代码
qsbk.py
# -*- coding: utf-8 -*-
import scrapy
from tutorial.items import TutorialItem
from scrapy.http import response
class QsbkSpider(scrapy.Spider):
    """Spider that scrapes joke authors and their text from qiushibaike.com."""

    name = 'qsbk'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/page/1/']

    def parse(self, response):
        """Yield one TutorialItem per (author, content) pair found on the page."""
        author_nodes = response.xpath('//div[@class="col1 old-style-col1"]//h2/text()')
        content_nodes = response.xpath('//div[@class="col1 old-style-col1"]//div[@class="content"]/span')
        for author_node, content_node in zip(author_nodes, content_nodes):
            # h2 text carries surrounding whitespace/newlines; strip it down to the name.
            author = author_node.get().strip()
            # A joke may span several text nodes inside the span; join them first.
            content = "".join(content_node.xpath('./text()').extract()).strip()
            yield TutorialItem(author=author, content=content)
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class TutorialItem(scrapy.Item):
    """Container for one scraped joke: the author name and the joke text."""

    author = scrapy.Field()   # author name, stripped of surrounding whitespace
    content = scrapy.Field()  # joke body text
pipelines.py代码
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json
class TutorialPipeline(object):
    """Pipeline that writes each item as one JSON line to ``spider.txt``.

    The output file is opened in ``open_spider`` and closed in
    ``close_spider`` (the lifecycle hooks Scrapy documents for resource
    management), rather than in ``__init__`` — constructing the pipeline
    no longer touches the filesystem, and the handle cannot leak if the
    spider never runs.
    """

    def __init__(self):
        # Created lazily in open_spider; None until the crawl starts.
        self.file = None

    def open_spider(self, spider):
        """Acquire the output file when the spider starts."""
        self.file = open("spider.txt", "w", encoding="utf-8")
        print("begin spider")

    def process_item(self, item, spider):
        """Serialize *item* as a JSON line, write it, and pass it on unchanged."""
        print(dict(item))
        # ensure_ascii=False keeps Chinese text readable instead of \uXXXX escapes.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        print("*" * 40)
        print(line)
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Release the output file when the spider finishes."""
        self.file.close()
        print("close spider")
settings.py