首先安装 mysqlclient 包。推荐使用豆瓣源安装,不仅速度快,而且不容易出错:
pip install -i https://pypi.douban.com/simple/ mysqlclient
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import codecs
import json
import MySQLdb
import MySQLdb.cursors
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exporters import JsonItemExporter
class ArticlespiderPipeline(object):
    """Pass-through pipeline that leaves every item untouched."""

    def process_item(self, item, spider):
        """Return ``item`` unchanged.

        Args:
            item: The scraped item handed to this pipeline.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object that was passed in.
        """
        return item
class JsonWithEncodingPipeline(object):
    """Hand-written JSON export: one JSON object per line in ``article.json``."""

    def __init__(self):
        # Open as UTF-8 text so non-ASCII characters are written as-is.
        self.file = codecs.open('article.json', 'w', encoding="utf-8")

    def process_item(self, item, spider):
        """Append ``item`` to the output file as one JSON line.

        Args:
            item: The scraped item; it is converted with ``dict(item)``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` that was passed in.
        """
        # ensure_ascii=False keeps Chinese text readable instead of \uXXXX escapes.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file.

        Scrapy invokes ``close_spider`` on item pipelines when the spider
        closes; a method named ``spider_closed`` is not called automatically,
        which left the file open in the original version.
        """
        self.file.close()

    # Keep the original method name working for any code that calls it directly.
    spider_closed = close_spider
class JsonExporterPipeline(object):
    """Export items to ``articleexport.json`` through Scrapy's JsonItemExporter."""

    def __init__(self):
        # The exporter is given a file opened in binary mode and an explicit encoding.
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        """Hand ``item`` to the exporter and return it.

        Args:
            item: The scraped item to export.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` that was passed in.
        """
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Finish the export and close the underlying file."""
        self.exporter.finish_exporting()
        # The file was opened in __init__ and must be released here.
        self.file.close()
class MysqlPipeline(object):
    """Synchronously insert each item into the ``jobbole_article`` MySQL table."""

    def __init__(self):
        # NOTE(review): connection settings are hard-coded; consider reading
        # them from the project settings instead.
        # The first argument is the server host, not a saved connection name.
        self.conn = MySQLdb.connect('localhost', 'root', '1234', 'jobbole', charset = 'utf8', use_unicode = True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one row built from ``item`` and commit it.

        Args:
            item: Mapping that provides ``title``, ``create_date``, ``url``
                and ``fav_nums``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so that pipelines running after this one
            still receive it (the original returned ``None``).
        """
        insert_sql = """
            insert into jobbole_article(title, create_date, url, fav_nums)
            VALUES (%s, %s, %s, %s)
        """
        # Values are passed separately so the driver does the escaping.
        params = (item["title"], item["create_date"], item["url"], item["fav_nums"])
        self.cursor.execute(insert_sql, params)
        self.conn.commit()
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection opened in ``__init__``."""
        self.cursor.close()
        self.conn.close()
class ArticleImagePipeline(ImagesPipeline):
    """Images pipeline that records the stored image path on the item."""

    def item_completed(self, results, item, info):
        """Copy the downloaded image's path into ``item["front_image_path"]``.

        Args:
            results: Iterable of ``(ok, value)`` pairs; for successful
                downloads ``value["path"]`` is the stored file path.
            item: The item whose images were processed.
            info: Unused here.

        Returns:
            The ``item``; ``front_image_path`` is set only when at least one
            download succeeded (the last successful path wins).
        """
        for ok, value in results:
            # A failed download has no "path" to read, so it is skipped
            # instead of raising and losing the whole item.
            if ok:
                item["front_image_path"] = value["path"]
        return item
我遇到的几个问题是:①
# Excerpt of MysqlPipeline.__init__, quoted to discuss the connection arguments.
def __init__(self):
# The first positional argument is the server host ('localhost' here),
# not the name of a saved connection.
self.conn = MySQLdb.connect('localhost', 'root', '1234', 'jobbole', charset = 'utf8', use_unicode = True)
self.cursor = self.conn.cursor()
在这个函数中,我一开始把主机名写成了 Jobbole(即我的连接名称),这是错的:第一个参数应该是主机名或 IP 地址。我在创建 Jobbole 这个连接时使用的主机是 localhost,所以这里应该填 localhost。
②url_object_id 必须有一个默认值。我感觉不把它设置成主键会有小问题:如果数据重复了怎么办?
所以我就将 url_object_id 设置为主键,但是再次爬取时只能爬取到第一条数据,并且报错。(很可能是因为上面的 insert 语句没有写入 url_object_id,每条记录都使用同一个默认值,从第二条开始主键就重复了。)
现在还不知道如何解决这个问题。