scrapy爬取三国演义
之前的博客中有一篇是用requests模块进行三国演义的爬取,这次我们使用scrapy框架来爬取,同时涉及到scrapy的一个新知识点——请求传参(meta)。
import scrapy
from ..items import SanguoItem
class SpidersanguoSpider(scrapy.Spider):
    """Crawl the chapters of "Romance of the Three Kingdoms" from gushiwen.cn.

    ``parse`` walks the paginated search listing and follows each chapter
    link; ``content`` extracts the chapter title and body text into a
    ``SanguoItem`` that is handed over between callbacks via ``Request.meta``.
    """
    name = 'spidersanguo'
    # allowed_domains = ['www.xxxx.com']
    start_urls = ['https://so.gushiwen.cn/search.aspx?type=guwen&page=1&value=三国演义']
    page_num = 2  # next listing page to request; page 1 comes from start_urls
    url = 'https://so.gushiwen.cn/search.aspx?type=guwen&page=%d&value=三国演义'

    def parse(self, response):
        """Parse one listing page: follow every chapter link, then the next page."""
        href_list = response.xpath(
            "//div[@class='sons']/div[@class='cont']/p[1]/a/@href").extract()
        for href in href_list:
            complete_href = "https://so.gushiwen.cn" + href
            # Create a fresh item PER request: the original created one shared
            # item outside the loop, so every in-flight chapter callback
            # mutated the same object and results could overwrite each other.
            item = SanguoItem()
            # callback does the chapter-page parsing; meta carries the item
            # into that callback (request-passing / 请求传参).
            yield scrapy.Request(url=complete_href, callback=self.content,
                                 meta={'item': item})
        if self.page_num <= 12:  # the novel's search listing spans 12 pages
            new_url = self.url % self.page_num
            self.page_num += 1
            yield scrapy.Request(url=new_url, callback=self.parse)

    def content(self, response):
        """Parse one chapter page and yield the populated item."""
        item = response.meta['item']  # item passed in from parse() via meta
        # Chapter heading plus the first paragraph line form the title.
        title = (response.xpath("//div[@class='cont']/h1/span/b/text()")[0].extract()
                 + response.xpath("//div[@class='contson']/p[1]/text()")[0].extract())
        main_content = "".join(
            response.xpath("//div[@class='contson']/p/text()").extract())
        item['title'] = title
        item['main_content'] = main_content
        yield item
上述代码中,涉及到参数的传递和回调函数的使用
yield scrapy.Request(url,callback):callback专门用于数据解析
meta进行参数的传递
item类:
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class SanguoItem(scrapy.Item):
    """Container for one crawled chapter of "Romance of the Three Kingdoms"."""
    title = scrapy.Field()         # chapter title (heading + first line)
    main_content = scrapy.Field()  # full chapter body text, joined paragraphs
管道类:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql
class SanguoPipeline:
    """Persist crawled chapters into the MySQL table ``sanguo``.

    The connection is opened once in ``open_spider`` and closed in
    ``close_spider``; each item is inserted in its own transaction.
    """

    conn = None
    cursor = None

    def open_spider(self, spider):
        """Open the MySQL connection (and one reusable cursor) at spider start."""
        print("开始爬取三国演义")
        # NOTE(review): credentials are hard-coded in source; move them to
        # Scrapy settings or environment variables before publishing.
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                                    passwd='zhengyunyu524', db='pythontest',
                                    charset="utf8")
        # Reuse one cursor instead of creating a new one per item.
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one chapter; commit on success, roll back on failure."""
        try:
            # Parameterized query: the original built SQL with "%s" string
            # formatting, which broke on quotes inside the chapter text and
            # was open to SQL injection. The driver now does the escaping.
            self.cursor.execute("insert into sanguo values(%s, %s)",
                                (item['title'], item['main_content']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        """Release the cursor and connection when the spider finishes."""
        print("三国演义爬取结束")
        # Guard against None: if open_spider failed (or no item was ever
        # processed in the original code) these attributes may be unset.
        if self.cursor is not None:
            self.cursor.close()
        if self.conn is not None:
            self.conn.close()
爬取结果: