pipeline
-
Image Pipeline (爬取图片)
# settings.py
# IMAGES_STORE = './images'   # directory where ImagesPipeline saves downloads

# pipelines.py  (note: scrapy's conventional filename is pipelines.py)
from scrapy import Request
from scrapy.exceptions import DropItem
# fixed: the real module/class are spelled "pipelines" / "ImagesPipeline";
# the original "scrapy.piplines.images" / "ImagesPipline" import fails.
from scrapy.pipelines.images import ImagesPipeline


class ImagePipline(ImagesPipeline):
    """Download the image referenced by each item via scrapy's images pipeline.

    NOTE(review): the class name keeps the original "Pipline" spelling because
    external config (ITEM_PIPELINES in settings.py) references it by name.
    """

    def get_media_requests(self, item, info):
        # Receive each item produced by the spider and turn its url into a
        # download Request.
        # fixed: the imported class is Request — `Requests` was a NameError.
        yield Request(item['url'])

    def file_path(self, request, response=None, info=None):
        # Store every image under the last path segment of its URL.
        url = request.url
        file_name = url.split('/')[-1]
        return file_name

    def item_completed(self, results, item, info):
        # Called when all downloads for one item finish.
        # `results` is a list of (ok, info_dict) tuples for this item;
        # drop the item if none of its images downloaded successfully.
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image Download Failed')
        return item
-
MysqlPipline (MySQL数据库)
# settings.py
# MYSQL_HOST = 'localhost'
# MYSQL_DATABASE = 'database'
# MYSQL_PORT = 3306
# MYSQL_USER = 'root'
# MYSQL_PASSWORD = 'root'

# pipelines.py
import pymysql


class MysqlPipline:
    """Persist each scraped item as one row in a MySQL table."""

    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        # Fetch the MySQL-related parameters from settings.py.
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT'),
        )

    def open_spider(self, spider):
        # Called automatically when the spider opens: open the connection.
        # fixed: use keyword arguments (pymysql >= 1.0 rejects the old
        # positional form) and the valid charset name 'utf8' — pymysql does
        # not accept 'utf-8' and would raise on connect.
        self.db = pymysql.connect(
            host=self.host,
            user=self.user,
            password=self.password,
            database=self.database,
            charset='utf8',
            port=self.port,
        )
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        # Called automatically when the spider closes: release the connection.
        self.db.close()

    def process_item(self, item, spider):
        # process_item is the required hook; the pipeline calls it for every
        # item. Build a parameterized INSERT from the item's fields.
        data = dict(item)
        keys = ', '.join(data.keys())
        values = ', '.join(['%s'] * len(data))
        # fixed: the original left sql = '' (executing an empty statement)
        # and misspelled data.values() as data.vaules().
        # assumes the item (or its class) exposes a `table` attribute naming
        # the target table — TODO confirm against the spider's Item class;
        # falls back to 'items' otherwise.
        table = getattr(item, 'table', 'items')
        sql = 'insert into %s (%s) values (%s)' % (table, keys, values)
        self.cursor.execute(sql, tuple(data.values()))
        self.db.commit()
        return item