1 保存到mysql
安装mysqlclient
pip install mysqlclient
如果是linux系统，需要先安装系统依赖包（它不是pip包）
sudo apt-get install libmysqlclient-dev
import pymysql


class MoviesSpiderPipeline(object):
    """Synchronous pipeline that inserts each scraped movie item into MySQL.

    Opens one pymysql connection at startup and commits after every item.
    """

    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,  # NOTE: the port must be an int, not a string
            'user': 'root',
            'password': 'wulinlin',
            'database': 'movies',
            'charset': 'utf8'
        }
        self.conn = pymysql.connect(**dbparams)  # open the database connection
        self.cursor = self.conn.cursor()  # create a cursor
        self._sql = None  # lazily-built INSERT statement cache

    def process_item(self, item, spider):
        # Execute the cached INSERT with this item's field values.
        self.cursor.execute(self.sql, (
            item['movie_title'], item['movie_title_url'],
            item['protagonist'], item['movie_score'],
            item['movie_director'], item['movie_country'],
            item['movie_content']))
        self.conn.commit()  # commit the transaction
        # FIX: pipelines must return the item, otherwise every later
        # pipeline in ITEM_PIPELINES receives None.
        return item

    def close_spider(self, spider):
        # FIX: release the cursor and connection when the spider finishes;
        # the original leaked both.
        self.cursor.close()
        self.conn.close()

    @property
    def sql(self):
        # Build the statement once and cache it (the original had a
        # duplicated, unreachable `return self._sql`).
        if not self._sql:
            self._sql = """
                insert into movies_info
                (id,movie_title,movie_title_url,protagonist,movie_score,movie_director,movie_country,movie_content)
                values (null,%s,%s,%s,%s,%s,%s,%s)
            """
        return self._sql
第二种异步插入数据库的方法
from twisted.enterprise import adbapi  # asynchronous database support
from pymysql import cursors  # cursor classes for pymysql


class JianshuTwistedPipeline(object):
    """Asynchronous MySQL pipeline backed by Twisted's adbapi connection pool.

    Inserts run on pool threads so the reactor (and the spider) is never
    blocked by database I/O.
    """

    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,  # NOTE: the port must be an int, not a string
            'user': 'root',
            'password': 'wulinlin',
            'database': 'jianshu',
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor
        }
        # Connection-pool object; 'pymysql' is the DB-API module name.
        self.dbpool = adbapi.ConnectionPool('pymysql', **dbparams)
        self._sql = None  # lazily-built INSERT statement cache

    @property
    def sql(self):
        # Build the statement once and cache it.
        if not self._sql:
            self._sql = """
                insert into article(id,title,content,article_id,origin_url,author,avatar,pub_time)
                values (0,%s,%s,%s,%s,%s,%s,%s)
            """
        return self._sql

    def process_item(self, item, spider):
        # FIX: the hook was misspelled `proce_item`, so Scrapy never
        # called this pipeline at all.
        defer = self.dbpool.runInteraction(self.insert_item, item)
        # Attach the error handler; adbapi passes the Failure first,
        # then the extra args (item, spider).
        defer.addErrback(self.handle_error, item, spider)
        # FIX: return the item so later pipelines receive it.
        return item

    def insert_item(self, cursor, item):
        # Runs on a pool thread with a live cursor supplied by adbapi.
        cursor.execute(self.sql, (
            item['title'], item['content'],
            item['article_id'], item['origin_url'],
            item['author'], item['avatar'],
            item['pub_time']))

    def handle_error(self, error, item, spider):
        # FIX: the signature must accept (error, item, spider) to match the
        # addErrback call above, and we print the actual failure instead of
        # the literal string 'error'.
        print('-' * 10 + 'error' + '-' * 10)
        print(error)
        print('-' * 10 + 'error' + '-' * 10)
2 保存到Json文件中
import codecs
import json
import io  # used by tests / any in-memory file substitute


class JsonWithEncodingPipelin(object):
    """Pipeline that appends each item to info.json as one JSON line."""

    def __init__(self):
        # UTF-8 text handle; one JSON object per line is written to it.
        self.file = codecs.open('info.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # ensure_ascii=False keeps non-ASCII text readable instead of
        # emitting \uXXXX escapes.
        lines = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(lines)
        # Must return the item so later pipelines receive it.
        return item

    def close_spider(self, spider):
        # FIX: the pipeline close hook is named `close_spider`; the original
        # `spider_closed` is a signal name and was never invoked, so the
        # file handle was never closed.
        self.file.close()
之后注意要在settings文件中打开配置选项
使用scrapy框架的json机制
# JSON export using the mechanism Scrapy itself provides.
from scrapy.exporters import JsonItemExporter


class JsonExportPipeline(object):
    """Write all items to info.json through Scrapy's JsonItemExporter."""

    def __init__(self):
        # The exporter expects a binary file handle.
        self.file = open('info.json', 'wb')
        self.exporter = JsonItemExporter(
            self.file, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # Finish the JSON array, then release the file handle.
        self.exporter.finish_exporting()
        self.file.close()
之后注意要在settings文件中打开配置选项
item_loader 机制
修改item文件
MapCompose()可以对字段进行预处理
import datetime

from scrapy.loader.processors import MapCompose, TakeFirst


def add_abc(value):
    """Example input processor: append '.abc' to a scraped value."""
    return value + '.abc'


def change_data(values):
    """Parse a 'YYYY/MM/DD' string into a date; fall back to today on failure."""
    try:
        # FIX: the method is `strptime` (was misspelled `striptime`) and the
        # parentheses were misplaced — `'%Y/%m/%d'.date()` called .date() on
        # the format string, raising AttributeError on every input.
        create_date = datetime.datetime.strptime(values, '%Y/%m/%d').date()
    except (ValueError, TypeError):
        # FIX: `.date` without () returned the bound method object, not a
        # date. Also narrowed the bare `except` to parsing failures.
        create_date = datetime.datetime.now().date()
    return create_date


class ArticleItem(scrapy.Item):
    title = scrapy.Field(
        # FIX: the missing comma between the two kwargs was a SyntaxError.
        input_processor=MapCompose(add_abc),  # preprocess each extracted value
        output_processor=TakeFirst()  # keep only the first value of the list
    )
    content = scrapy.Field(
        input_processor=MapCompose(lambda x: x + '.abc')
    )
    create_time = scrapy.Field(
        input_processor=MapCompose(change_data)
    )
自定义itemloader类(附加)
from scrapy.loader import ItemLoader
# FIX: TakeFirst was used below but never imported in this snippet.
from scrapy.loader.processors import TakeFirst


class Articleitemloade(ItemLoader):
    """Custom ItemLoader: every field defaults to taking the first extracted value."""
    default_output_processor = TakeFirst()
通过 item_loader 加载 item
item_loader = ItemLoader(item = ArticleItem(),response) item_loader.add_css('字段名','解析规则') item_loader.add_value('字段名',response.url) articel_item = item_loader.load_item() yield article_item