Recently I wrote two spiders in a crawler project (http://blog.csdn.net/mr_blued?t=1), and both need to save their data to a MySQL database through the item pipeline, so I wrote two classes in pipelines.py:
a MoviePipeline() and a BookPipeline().
import pymysql

class MoviePipeline(object):
    def __init__(self):
        # Connect to the database
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='1likePython',
                                    database='TESTDB', charset='utf8')
        # Create a cursor object
        self.cursor = self.conn.cursor()
        # Clear the table so every run starts fresh
        self.cursor.execute('truncate table Movie')
        self.conn.commit()

    def process_item(self, item, spider):
        try:
            self.cursor.execute("insert into Movie (name,movieInfo,star,number,quote) "
                                "VALUES (%s,%s,%s,%s,%s)",
                                (item['movie_name'], item['movie_message'], item['movie_star'],
                                 item['number'], item['movie_quote']))
            self.conn.commit()
        except pymysql.Error:
            # Undo the failed transaction and log the offending row
            self.conn.rollback()
            print("Error: %s,%s,%s,%s,%s" % (item['movie_name'], item['movie_message'], item['movie_star'],
                                             item['number'], item['movie_quote']))
        return item

class BookPipeline(object):
    def __init__(self):
        # Connect to the database
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='1likePython',
                                    database='TESTDB', charset='utf8')
        # Create a cursor object
        self.cursor = self.conn.cursor()
        # Clear the table so every run starts fresh
        self.cursor.execute('truncate table Book')
        self.conn.commit()

    def process_item(self, item, spider):
        try:
            self.cursor.execute("insert into Book (book_name,author,book_type,book_state,"
                                "book_update,book_time,new_href,book_intro) "
                                "VALUES (%s,%s,%s,%s,%s,%s,%s,%s)",
                                (item['book_name'], item['author'], item['book_type'],
                                 item['book_state'], item['book_update'], item['book_time'],
                                 item['new_href'], item['book_intro']))
            self.conn.commit()
        except pymysql.Error:
            # Undo the failed transaction and log the offending row
            self.conn.rollback()
            print("Error: %s,%s,%s,%s,%s,%s,%s,%s" % (item['book_name'], item['author'], item['book_type'],
                                                      item['book_state'], item['book_update'], item['book_time'],
                                                      item['new_href'], item['book_intro']))
        return item
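For reference, both pipelines assume the Movie and Book tables already exist in TESTDB. Here is a minimal sketch of how they might be created; the column names are taken from the INSERT statements above, but the column types are my own guesses, not something from the original project:

import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='1likePython',
                       database='TESTDB', charset='utf8')
cursor = conn.cursor()
# Column names mirror the INSERT statements; the types are assumptions
cursor.execute("""CREATE TABLE IF NOT EXISTS Movie (
    name VARCHAR(100),
    movieInfo TEXT,
    star VARCHAR(50),
    number VARCHAR(50),
    quote VARCHAR(255)
) DEFAULT CHARSET=utf8""")
cursor.execute("""CREATE TABLE IF NOT EXISTS Book (
    book_name VARCHAR(100),
    author VARCHAR(100),
    book_type VARCHAR(50),
    book_state VARCHAR(50),
    book_update VARCHAR(100),
    book_time VARCHAR(100),
    new_href VARCHAR(255),
    book_intro TEXT
) DEFAULT CHARSET=utf8""")
conn.commit()
conn.close()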
Next I registered both classes in settings.py:
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'Mycrawl.pipelines.MoviePipeline': 100,
    'Mycrawl.pipelines.BookPipeline': 300,
}
Then I ran the spiders and discovered that every pipeline enabled in ITEM_PIPELINES receives the items of every spider: when the book spider ran, its items were also pushed through MoviePipeline(), which obviously raises an error, since book items carry none of the movie fields. To avoid the error, I had to comment this line out in settings.py:

'Mycrawl.pipelines.MoviePipeline': 100,

But as soon as I wanted to run the movie spider again, I had to uncomment that line and comment out the other one instead, which quickly gets tedious.
So I modified pipelines.py to merge the two classes into one that checks the name of the spider currently running:
class MycrawlPipeline(object):
    def __init__(self):
        # Connect to the database
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='1likePython',
                                    database='TESTDB', charset='utf8')
        # Create a cursor object
        self.cursor = self.conn.cursor()
        # Clear both tables so every run starts fresh
        self.cursor.execute('truncate table Movie')
        self.cursor.execute('truncate table Book')
        self.conn.commit()

    def process_item(self, item, spider):
        # If the running spider is the movie spider
        if spider.name == 'movie':
            try:
                self.cursor.execute("insert into Movie (name,movieInfo,star,number,quote) "
                                    "VALUES (%s,%s,%s,%s,%s)",
                                    (item['movie_name'], item['movie_message'], item['movie_star'],
                                     item['number'], item['movie_quote']))
                self.conn.commit()
            except pymysql.Error:
                self.conn.rollback()
                print("Error: %s,%s,%s,%s,%s" % (item['movie_name'], item['movie_message'], item['movie_star'],
                                                 item['number'], item['movie_quote']))
            return item
        # If the running spider is the book spider
        elif spider.name == 'book':
            try:
                self.cursor.execute("insert into Book (book_name,author,book_type,book_state,"
                                    "book_update,book_time,new_href,book_intro) "
                                    "VALUES (%s,%s,%s,%s,%s,%s,%s,%s)",
                                    (item['book_name'], item['author'], item['book_type'],
                                     item['book_state'], item['book_update'], item['book_time'],
                                     item['new_href'], item['book_intro']))
                self.conn.commit()
            except pymysql.Error:
                self.conn.rollback()
                print("Error: %s,%s,%s,%s,%s,%s,%s,%s" % (item['book_name'], item['author'], item['book_type'],
                                                          item['book_state'], item['book_update'], item['book_time'],
                                                          item['new_href'], item['book_intro']))
            return item

    def close_spider(self, spider):
        # Release the database connection when the spider finishes
        self.conn.close()
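With the two classes merged into one, ITEM_PIPELINES in settings.py should point at the single merged class instead. I kept the priority at 300; the number only sets the running order relative to other enabled pipelines:

ITEM_PIPELINES = {
    'Mycrawl.pipelines.MycrawlPipeline': 300,
}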
With that, a single pipelines class is enough, and each spider is matched to its own pipeline logic.
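Now scrapy crawl movie and scrapy crawl book can both be run without touching settings.py again: Scrapy passes the running spider into process_item(), so each branch only fires for its own spider.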