item与pipeline方法对应的问题
一个spider爬取多种信息、需要输出多张表的时候容易遇到此类问题。pipeline方法是item输出路径上的统一处理节点:所有item都会流经pipeline,由它对爬取结果做集中处理。
笨方法:
需要对爬取的不同信息做定制化输出时,直接修改pipeline,在其中引入对item类型的判断
# Insert the crawled data: dispatch on the concrete item type.
# NOTE(review): `item` and `cursor` come from the enclosing pipeline method
# (process_item); this snippet is a fragment of that method's body.
if item.__class__.__name__ == 'ZhihuAnswerItem':
    # Answer item -> answer_database table.
    insert_sql = '''
insert into answer_database(author_name, author_id, answer_content, answer_url, question_id,answer_parise_num,answer_comments_num,answer_creat_time,answer_update_time,answer_crawl_time)VALUES( %s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
'''
    # Parameterized query: values are passed separately, never string-formatted.
    cursor.execute(insert_sql, (
        item['author_name'], item['author_id'], item['answer_content'],
        item['answer_url'], item['question_id'], item['answer_parise_num'],
        item['answer_comments_num'], item['answer_creat_time'],
        item['answer_update_time'], item['answer_crawl_time']))
elif item.__class__.__name__ == 'ZhihuQuestionItem':
    # Question item -> question_database table.
    insert_sql = '''
insert into question_database(question_id, question_url, title, topic, answer_num, comment_num, focus_num, watch_num, content)VALUES( %s,%s,%s,%s,%s,%s,%s,%s,%s)
'''
    cursor.execute(insert_sql, (
        item['question_id'], item['question_url'], item['question_title'],
        item['topic'], item['answer_num'], item['comment_num'],
        item['focus_num'], item['watch_num'], item['content']))
好方法:
更好的解决方法:在每个item中定义好insert(或其他输出)方法;item传入pipeline时,pipeline直接调用各自item的insert方法即可,无需类型判断,具有更高的可配置性。
class ZhihuAnswerItem(scrapy.Item):
    """Item for a single Zhihu answer.

    Carries its own SQL via :meth:`get_insert` so the pipeline can stay
    generic and call ``item.get_insert()`` instead of type-checking each
    item class.
    """

    author_name = scrapy.Field()
    author_id = scrapy.Field()
    # Content is cleaned on input; `soup` is presumably an HTML-stripping
    # processor defined elsewhere — TODO confirm.
    answer_content = scrapy.Field(
        input_processor=MapCompose(soup)
    )
    answer_url = scrapy.Field()
    question_id = scrapy.Field()
    # NOTE: "parise"/"creat" are historical typos kept to match the existing
    # DB column names; renaming would break the schema.
    answer_parise_num = scrapy.Field()
    answer_comments_num = scrapy.Field()
    answer_creat_time = scrapy.Field(
        input_processor=MapCompose(timestamp_covert_to_datetime)
    )
    answer_update_time = scrapy.Field(
        input_processor=MapCompose(timestamp_covert_to_datetime)
    )
    answer_crawl_time = scrapy.Field()

    def get_insert(self):
        """Return ``(insert_sql, params)`` for this answer.

        ``ON DUPLICATE KEY UPDATE`` refreshes the mutable columns when an
        answer is re-crawled, so repeated crawls update the row instead of
        failing on a primary-key conflict.
        """
        insert_sql = '''
insert into answer_database(author_name, author_id, answer_content, answer_url, question_id,answer_parise_num,answer_comments_num,answer_creat_time,answer_update_time,answer_crawl_time)VALUES( %s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE answer_content=VALUES (answer_content),answer_parise_num=VALUES (answer_parise_num),answer_comments_num=VALUES (answer_comments_num),answer_update_time=VALUES (answer_update_time)
'''
        params = (self['author_name'], self['author_id'], self['answer_content'],
                  self['answer_url'], self['question_id'], self['answer_parise_num'],
                  self['answer_comments_num'], self['answer_creat_time'],
                  self['answer_update_time'], self['answer_crawl_time'])
        return insert_sql, params
class ZhihuQuestionItem(scrapy.Item):
    """Item for a single Zhihu question.

    Like ``ZhihuAnswerItem``, it exposes :meth:`get_insert` so the pipeline
    can call ``item.get_insert()`` without knowing the concrete item class.
    """

    question_id = scrapy.Field()
    question_url = scrapy.Field()
    question_title = scrapy.Field()
    # Override the default output_processor so the collected topic list is
    # flattened to a single string.
    topic = scrapy.Field(
        output_processor=list_to_str
    )
    answer_num = scrapy.Field(
        input_processor=MapCompose(get_num)
    )
    comment_num = scrapy.Field(
        input_processor=MapCompose(get_num)
    )
    focus_num = scrapy.Field()
    watch_num = scrapy.Field()
    content = scrapy.Field()

    def get_insert(self):
        """Return ``(insert_sql, params)`` for this question.

        ``ON DUPLICATE KEY UPDATE`` mirrors the approach used for
        ZhihuAnswerItem: re-crawling an existing question updates its
        mutable columns instead of raising a primary-key conflict on
        question_id.
        """
        insert_sql = '''
insert into question_database(question_id, question_url, title, topic, answer_num, comment_num, focus_num, watch_num, content)VALUES( %s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE title=VALUES (title),topic=VALUES (topic),answer_num=VALUES (answer_num),comment_num=VALUES (comment_num),focus_num=VALUES (focus_num),watch_num=VALUES (watch_num),content=VALUES (content)
'''
        params = (self['question_id'], self['question_url'], self['question_title'],
                  self['topic'], self['answer_num'], self['comment_num'],
                  self['focus_num'], self['watch_num'], self['content'])
        return insert_sql, params