数据存储第一种:本地json
import json
class JsonPipeline(object):
    """Item pipeline that writes every scraped item to a JSON Lines file.

    Each item becomes one JSON object on its own line. The file is UTF-8
    encoded and non-ASCII characters are written verbatim rather than as
    ``\\uXXXX`` escapes.
    """

    def __init__(self, path='job.json'):
        """Open the output file for writing, truncating existing content.

        Args:
            path: Destination file. Defaults to ``job.json`` (relative to
                the current working directory), so constructing the
                pipeline without arguments behaves as before.
        """
        self.file = open(path, 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Append ``item`` to the output file and pass it on unchanged.

        Args:
            item: The scraped item; anything accepted by ``dict()``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object.
        """
        record = json.dumps(dict(item), ensure_ascii=False)
        self.file.write(record + '\n')
        return item

    def close_spider(self, spider):
        """Flush and close the output file."""
        self.file.close()
在settings中添加管道 ITEM_PIPELINES = {'Lagou.pipelines.JsonPipeline': 100}
数据存储第二种:mongoDB
import pymongo
class MongoPipeline(object):
    """Item pipeline that stores every scraped item as a MongoDB document."""

    def __init__(self):
        """Connect to the local MongoDB server and select the ``lagou`` database."""
        # Connection details are hard-coded here; they could instead be read
        # from the project settings (e.g. MONGO_URI and MONGO_DB).
        self.client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        self.db = self.client['lagou']

    def process_item(self, item, spider):
        """Insert a copy of ``item`` into the collection and pass it on.

        Args:
            item: The scraped item; anything accepted by ``dict()``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object.
        """
        # insert_one() replaces Collection.insert(), which was deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        # To avoid duplicate documents per positionId, an upsert such as
        # replace_one({'positionId': item['positionId']}, dict(item), upsert=True)
        # could be used instead.
        self.db['拉钩关键词招聘信息表'].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the MongoDB client connection."""
        self.client.close()
在settings中添加管道 ITEM_PIPELINES = {'Lagou.pipelines.MongoPipeline': 100}
数据存储第三种:mysql(同步插入:execute、commit 都是同步阻塞操作,后面爬取的 item 多了可能造成阻塞)
import pymysql
class MysqlPipeline(object):
    """Item pipeline that inserts every scraped item into MySQL synchronously.

    ``execute`` and ``commit`` are blocking calls, so a large volume of items
    can hold up the crawl while the database catches up.
    """

    # Item keys, in the order their values are passed to the INSERT statement.
    _FIELDS = (
        'positionId', 'city', 'positionName', 'salary', 'workYear',
        'education', 'companyShortName', 'companyFullName', 'companySize',
        'industryField', 'positionAdvantage', 'createTime',
    )

    def __init__(self):
        """Open the MySQL connection and create the cursor used for inserts."""
        self.conn = pymysql.connect(host='localhost', port=3306, db='lagou',
                                    user='root', passwd='123456', charset='utf8')
        # All database operations are carried out through the cursor.
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert ``item`` into ``job_info`` and commit.

        Args:
            item: The scraped item; must provide every key in ``_FIELDS``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object.

        Raises:
            KeyError: If ``item`` lacks one of the expected keys.
            Exception: Whatever the database driver raises; the open
                transaction is rolled back before the error propagates.
        """
        sql = 'INSERT INTO job_info VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        values = tuple(item[field] for field in self._FIELDS)
        try:
            self.cursor.execute(sql, values)
            self.conn.commit()
        except Exception:
            # Undo the failed statement so later items start from a clean
            # transaction on the shared connection.
            self.conn.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Release the cursor and close the connection."""
        try:
            self.cursor.close()
        finally:
            self.conn.close()
在settings中添加管道 ITEM_PIPELINES = {'Lagou.pipelines.MysqlPipeline': 100}
数据存储第四种:mysql之异步插入(关系型)
原因:spider 的解析速度超过了入库(关系型数据库)速度。到后期爬取的 URL 越来越多,也就是 item 越来越多,数据库的插入速度跟不上,造成阻塞。
办法:twisted 框架提供了一种将 mysql(关系型数据库)插入异步化的方式,把 mysql 的 execute、commit 同步操作变成异步操作。
工具:利用的就是twisted框架提供的工具--连接池(将mysql同步操作转成异步操作)
import pymysql
from twisted.enterprise import adbapi#twisted的enterprise中有一个模块adbapi,可以将我们的mysql操作变成异步的操作
class MySQLTwistedPipeline(object):
    """Item pipeline that inserts items into MySQL through a Twisted pool.

    Inserts are handed to a ``twisted.enterprise.adbapi.ConnectionPool`` so
    that ``process_item`` returns without waiting for the database.
    """

    # Item keys, in the order their values are passed to the INSERT statement.
    _FIELDS = (
        'positionId', 'city', 'positionName', 'salary', 'workYear',
        'education', 'companyShortName', 'companyFullName', 'companySize',
        'industryField', 'positionAdvantage', 'createTime',
    )

    def __init__(self, dbpool):
        """Store the connection pool used for all inserts.

        Args:
            dbpool: An object exposing ``runInteraction`` and ``close``,
                normally an ``adbapi.ConnectionPool``.
        """
        self.dbpool = dbpool

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from a crawler by delegating to ``from_settings``."""
        return cls.from_settings(crawler.settings)

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from the project's ``MYSQL_*`` settings.

        Args:
            settings: Mapping providing ``MYSQL_HOST``, ``MYSQL_DB_NAME``,
                ``MYSQL_USER`` and ``MYSQL_PASSWORD``.

        Returns:
            A pipeline instance backed by a new connection pool.
        """
        # These keyword names are forwarded to the pymysql connect call by
        # the pool, so they must match its parameter names.
        dbparms = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DB_NAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWORD'],
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
        )
        dbpool = adbapi.ConnectionPool('pymysql', **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        """Schedule an asynchronous insert of ``item`` and pass it on.

        Args:
            item: The scraped item; must provide every key in ``_FIELDS``.
            spider: The spider that produced the item; used only for
                logging if the insert fails.

        Returns:
            The same ``item`` object, returned before the insert completes.
        """
        query = self.dbpool.runInteraction(self.do_insert, item)
        # Extra arguments are passed through to the errback after the failure.
        query.addErrback(self.handle_error, item, spider)
        return item

    def handle_error(self, failure, item=None, spider=None):
        """Report a failed asynchronous insert.

        Args:
            failure: The failure object describing the error.
            item: The item whose insert failed, if known.
            spider: The spider that produced the item, if known. When given,
                the error goes to its logger; otherwise it is printed.
        """
        if spider is not None:
            spider.logger.error('MySQL insert failed for item %r: %s', item, failure)
        else:
            print(failure)

    def do_insert(self, cursor, item):
        """Run the INSERT for ``item`` on the cursor supplied by the pool.

        Uses the same ``job_info`` statement and column order as the
        synchronous MySQL pipeline so both accept the same items.

        Args:
            cursor: Database cursor provided by ``runInteraction``.
            item: The scraped item; must provide every key in ``_FIELDS``.
        """
        sql = 'INSERT INTO job_info VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        values = tuple(item[field] for field in self._FIELDS)
        cursor.execute(sql, values)

    def close_spider(self, spider):
        """Shut down the connection pool when the spider finishes."""
        self.dbpool.close()
settings中的设置为:
ITEM_PIPELINES = {
    # 'Lagou.pipelines.JsonPipeline': 100,
    # 'Lagou.pipelines.MongoPipeline': 200,
    # 'Lagou.pipelines.MysqlPipeline': 300,
    'Lagou.pipelines.MySQLTwistedPipeline': 400,
}
MYSQL_DB_NAME = 'lagou'
MYSQL_HOST = 'localhost'
MYSQL_USER = 'root'
MYSQL_PASSWORD = '123456'