scrapy中pipelines

pipelines主要用于保存抓取到的内容。本文展示了两种存储方式:一种是以json形式存储到文件;另一种是存入mysql数据库。

#以json形式存储
import json
import codecs
class JsonWithEncodingCsdnBoKePipeline(object):
    """Item pipeline that writes every scraped item to ``boke.json``.

    The file holds one UTF-8 encoded JSON object per line; non-ASCII
    characters are written literally instead of being escaped.
    """

    def __init__(self):
        # Mode 'w' truncates whatever a previous run left in the file.
        self.file = codecs.open('boke.json', 'w', encoding='utf8')

    def process_item(self, item, spider):
        """Write *item* as a single JSON line and return it unchanged.

        Returning the item lets later pipelines keep processing it.
        """
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes.

        ``close_spider`` is the hook name Scrapy invokes on item pipelines;
        the previous name ``spider_closed`` was never called automatically,
        so the file was left open until interpreter exit.
        """
        self.file.close()

    # Kept so that code which connected the old method name to a signal
    # (or called it directly) keeps working.
    spider_closed = close_spider

#mysql数据库存储
import MySQLdb
import datetime
# MySQL connection parameters (local server, database 'chw', root user
# with an empty password).
dbuser = 'root'
dbpass = ''
dbname = 'chw'
dbhost = '127.0.0.1'
# Stored as a string, not an integer.
dbport = '3306'
class CsdnBoKePipeline(object):
    """Item pipeline that stores each item's title and url in MySQL.

    Rows go into the ``boke`` table. Connection parameters are read from
    the module-level ``dbuser``, ``dbpass``, ``dbname``, ``dbhost`` and
    ``dbport`` values.
    """

    def __init__(self):
        # dbport is kept as a string at module level; the driver expects
        # an integer port.
        self.conn = MySQLdb.connect(user=dbuser, passwd=dbpass, db=dbname,
                                    host=dbhost, port=int(dbport),
                                    charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()
        # Empty the table so every crawl starts from a clean state.
        self.cursor.execute("truncate table boke;")
        self.conn.commit()

    def process_item(self, item, spider):
        """Insert the item's ``title`` and ``url`` and return the item.

        A database error is reported and the item is still returned, so a
        failed insert does not stop later pipelines from seeing the item.
        """
        try:
            self.cursor.execute(
                """insert into boke (title,url) values(%s,%s)""",
                (item['title'].encode('utf8'),
                 item['url'].encode('utf8')),
            )
            self.conn.commit()
        except MySQLdb.Error as e:
            # Print the exception itself: ``e.message`` is empty for errors
            # raised with (code, message) arguments.
            print(e)
            # Discard the failed statement so it cannot leak into the next
            # commit; stay best-effort if the connection itself is gone.
            try:
                self.conn.rollback()
            except MySQLdb.Error:
                pass
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        self.cursor.close()
        self.conn.close()

settings.py中需要进行如下设置:

# Browser-like User-Agent string (Chrome 49 on Windows).
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36'
# MySQL database connection settings.
# NOTE(review): the pipeline code shown with this snippet reads its own
# module-level db* constants rather than these MYSQL_* values -- confirm
# which of the two is meant to be authoritative.
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'chw'
MYSQL_USER = 'root'
MYSQL_PASSWD = ''
# Item pipelines to enable, keyed by dotted class path.
# NOTE(review): both entries use the same value 300, so the value alone
# does not fix their relative order -- give them distinct values if the
# order matters.
ITEM_PIPELINES = {
   'csdn_bo_ke.pipelines.CsdnBoKePipeline': 300,
   'csdn_bo_ke.pipelines.JsonWithEncodingCsdnBoKePipeline':300,
}

main启动脚本:

#!/usr/bin/env python
#encoding=utf-8
# Launcher script: starts the "csdn" spider without typing the scrapy
# command by hand, e.g. when running from an IDE.
__author__ = 'chw'
from scrapy import cmdline
# Hands the same argument list to Scrapy that the command line
# `scrapy crawl csdn` would supply.
cmdline.execute("scrapy crawl csdn".split())

参考于:http://blog.csdn.net/qy20115549/article/details/52575291

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值