scrapy框架爬取八一中文网小说
1、创建项目，定义爬虫文件
# -*- coding: utf-8 -*-
import scrapy
class ZwwSpider(scrapy.Spider):
    """Crawl a novel chapter-by-chapter from www.81zw.com.

    Starts at one chapter page and keeps following the "next chapter"
    link, yielding one {'title', 'content'} dict per chapter.
    """
    name = 'zww'
    allowed_domains = ['81zw.com']
    start_urls = ['https://www.81zw.com/book/14662/3762757.html']

    def parse(self, response):
        """Extract the chapter title and body, then follow the next-chapter link.

        Yields:
            dict: {'title': str or None, 'content': str} for the current page,
            then a new Request for the next chapter (same callback).
        """
        title = response.xpath('//h1/text()').extract_first()
        # extract() returns a list of text nodes; join into a single string
        # first — concatenating a list to a str raises TypeError downstream.
        # NOTE(review): replacing every single space with a newline assumes
        # the site uses spaces only as paragraph indentation — confirm
        # against the live page markup.
        content = ''.join(
            response.xpath('//div[@id="content"]/text()').extract()
        ).replace(' ', '\n')
        yield {
            'title': title,
            'content': content
        }
        # "Next chapter" is the third anchor in the bottom navigation bar.
        next_url = response.xpath('//div[@class="bottem2"]/a[3]/@href').extract_first()
        # Chapter pages end in '.html'; when the current URL no longer does,
        # we have left the chapter sequence and the crawl stops.
        if response.url.find('.html') != -1:
            # response.urljoin resolves a relative href against the page URL.
            yield scrapy.Request(response.urljoin(next_url), callback=self.parse)

    def parse_info(self, response):
        """Placeholder for an alternative parse callback (currently unused)."""
        pass
2、使用pipeline保存文件
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
class XiaoshuoPipeline:
    """Write every scraped chapter (title followed by content) to one text file."""

    def open_spider(self, spider):
        # Single output file for the whole crawl, opened when the spider starts.
        self.filename = open('wddf.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Append one chapter to the output file and pass the item through.

        Args:
            item: dict with str values under 'title' and 'content'
                (the spider already joins the xpath list into a string).
            spider: the spider instance (unused).

        Returns:
            The item unchanged, so later pipelines still receive it.
        """
        title = item['title']
        content = item['content']
        # Both fields are plain strings, so direct concatenation is safe.
        # (The earlier TypeError came from an unjoined xpath list; with the
        # spider joining it, the content line must be written, not skipped.)
        info = title + '\n' + content + '\n'
        self.filename.write(info)
        # Flush so progress appears on disk immediately instead of sitting
        # in the write buffer until it fills.
        self.filename.flush()
        return item

    def close_spider(self, spider):
        self.filename.close()