Scrapy Usage, Part 5: Configuring MySQL, MongoDB, and Redis in Scrapy

1. Configuring MySQL

Modify settings.py:

# start MySQL database configure setting
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'cnblogsdb'
MYSQL_USER = 'root'
MYSQL_PASSWD = 'root'
# end of MySQL database configure setting
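The pipeline below relies on the MySQLdb driver. If it is not installed yet, it can be added with the package below (MySQL-python targets Python 2; on newer Pythons the mysqlclient fork provides the same MySQLdb module):

pip install MySQL-python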

Modify pipelines.py:

[root@bogon cnblogs]# more pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy import signals
import json
import codecs
from twisted.enterprise import adbapi
from twisted.python import log
from datetime import datetime
from hashlib import md5
import MySQLdb
import MySQLdb.cursors


class JsonWithEncodingCnblogsPipeline(object):
    def __init__(self):
        self.file = codecs.open('cnblogs.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def spider_closed(self, spider):
        self.file.close()


class MySQLStoreCnblogsPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
        return cls(dbpool)

    # called by the pipeline for every scraped item
    def process_item(self, item, spider):
        d = self.dbpool.runInteraction(self._do_upinsert, item, spider)
        d.addErrback(self._handle_error, item, spider)
        d.addBoth(lambda _: item)
        return d

    # update an existing row or insert a new one
    def _do_upinsert(self, conn, item, spider):
        linkmd5id = self._get_linkmd5id(item)
        now = datetime.utcnow().replace(microsecond=0).isoformat(' ')
        conn.execute("""
            select 1 from cnblogsinfo where linkmd5id = %s
        """, (linkmd5id, ))
        ret = conn.fetchone()
        if ret:
            conn.execute("""
                update cnblogsinfo set title = %s, description = %s, link = %s,
                    listUrl = %s, updated = %s where linkmd5id = %s
            """, (item['title'], item['desc'], item['link'], item['listUrl'], now, linkmd5id))
        else:
            conn.execute("""
                insert into cnblogsinfo(linkmd5id, title, description, link, listUrl, updated)
                values(%s, %s, %s, %s, %s, %s)
            """, (linkmd5id, item['title'], item['desc'], item['link'], item['listUrl'], now))

    # md5-hash the url; used to avoid storing the same page twice
    def _get_linkmd5id(self, item):
        return md5(item['link']).hexdigest()

    # error handling
    def _handle_error(self, failure, item, spider):
        log.err(failure)

Modify the settings.py configuration file again to enable MySQLStoreCnblogsPipeline:

ITEM_PIPELINES = {
    'cnblogs.pipelines.JsonWithEncodingCnblogsPipeline': 300,
    'cnblogs.pipelines.MySQLStoreCnblogsPipeline': 300,
}
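MySQLStoreCnblogsPipeline writes into a cnblogsinfo table that must already exist in the cnblogsdb database. The original post does not show the schema, so the snippet below is only a minimal sketch inferred from the columns the pipeline reads and writes; the column types and lengths are assumptions.

import MySQLdb

# create the table the pipeline expects (schema is an assumption based on the columns used above)
conn = MySQLdb.connect(host='localhost', user='root', passwd='root', db='cnblogsdb', charset='utf8')
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS cnblogsinfo (
        linkmd5id CHAR(32) NOT NULL PRIMARY KEY,  -- md5 of the link, used for de-duplication
        title TEXT,
        description TEXT,
        link TEXT,
        listUrl TEXT,
        updated DATETIME
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()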

2. Configuring MongoDB

Install pymongo:

pip install pymongo

Configure MongoDB's host, port, database name, and collection name in settings.py, so the connection details can be swapped out in one place:

MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'your_database_name'
MONGODB_DOCNAME = 'your_collection_name'

Reference the pipeline class in settings.py so that it takes effect:

ITEM_PIPELINES = {'novespider.pipelines.NovespiderPipeline': 300}

Use MongoDB in pipelines.py:

from scrapy.conf import settings
import pymongo


class NovespiderPipeline(object):
    def __init__(self):
        # connect to MongoDB using the values from settings.py
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        dbName = settings['MONGODB_DBNAME']
        table = settings['MONGODB_DOCNAME']
        client = pymongo.MongoClient(host=host, port=port)
        db = client[dbName]
        self.table = db[table]

    def process_item(self, item, spider):
        bookInfo = dict(item)
        self.table.insert(bookInfo)
        return item
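Note that scrapy.conf and Collection.insert() are deprecated in newer Scrapy and PyMongo releases. If you are on current versions, a sketch along the following lines keeps the same behaviour, reading the same MONGODB_* settings through from_crawler and writing with insert_one (the class name simply mirrors the pipeline above):

import pymongo


class NovespiderPipeline(object):
    def __init__(self, host, port, db_name, doc_name):
        # open the connection once when the pipeline is created
        self.client = pymongo.MongoClient(host=host, port=port)
        self.table = self.client[db_name][doc_name]

    @classmethod
    def from_crawler(cls, crawler):
        # read the MONGODB_* settings via the crawler instead of scrapy.conf
        s = crawler.settings
        return cls(s.get('MONGODB_HOST'), s.getint('MONGODB_PORT'),
                   s.get('MONGODB_DBNAME'), s.get('MONGODB_DOCNAME'))

    def process_item(self, item, spider):
        self.table.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()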

Example: crawl the nine books of Daomubiji (盗墓笔记) and their chapters

1. Create the project and the spider

scrapy startproject novespider

cd novespider

scrapy genspider novspider "daomubiji.com"

2. Define what to extract and write items.py

Open the site: http://www.daomubiji.com/

Inspect the page structure: each book sits in its own table.

import scrapy


class NovespiderItem(scrapy.Item):
    bookName = scrapy.Field()
    bookTitle = scrapy.Field()
    chapterNum = scrapy.Field()
    chapterName = scrapy.Field()
    chapterURL = scrapy.Field()

3. Write the spider file, spiders/novspider.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector
from novespider.items import NovespiderItem


class NovspiderSpider(scrapy.Spider):
    name = "novspider"
    allowed_domains = ["daomubiji.com"]
    start_urls = ['http://www.daomubiji.com']

    def parse(self, response):
        selector = Selector(response)
        # each book lives in its own table; in newer Scrapy you can simply use response.xpath("//table")
        table = selector.xpath("//table")
        for each in table:
            bookName = each.xpath("tr/td[@colspan='3']/center/h2/text()").extract()[0]
            content = each.xpath("tr/td/a/text()").extract()
            url = each.xpath("tr/td/a/@href").extract()
            for i in range(len(url)):
                item = NovespiderItem()
                # book name and chapter url
                item['bookName'] = bookName
                item['chapterURL'] = url[i]
                try:
                    # book title and chapter number
                    item['bookTitle'] = content[i].split(' ')[0]
                    item['chapterNum'] = content[i].split(' ')[1]
                except Exception as e:
                    continue
                try:
                    item['chapterName'] = content[i].split(' ')[2]
                except Exception as e:
                    item['chapterName'] = content[i].split(' ')[1][-3:]
                yield item
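With the MongoDB pipeline from above enabled in ITEM_PIPELINES, the spider can then be run from the project directory:

scrapy crawl novspider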

3. Configuring Redis

Configure Redis support in settings.py.

First, install scrapy_redis:

pip install scrapy_redis

SCHEDULER = "scrapy_redis.scheduler.Scheduler"SCHEDULER_PERSIST=True

SCHEDULER_QUEUE_CLASS=scrapy_redis.queue.SpiderPriorityQueue

REDIS_URL=None

REDIS_HOST= "127.0.0.1"REDIS_PORT= 6379

Reuse the project and spider from above.

Add a text field for the chapter body text to items.py:

import scrapy


class NovespiderItem(scrapy.Item):
    bookName = scrapy.Field()
    bookTitle = scrapy.Field()
    chapterNum = scrapy.Field()
    chapterName = scrapy.Field()
    chapterURL = scrapy.Field()
    text = scrapy.Field()

Write the spider:

# -*- coding: utf-8 -*-
import re
import scrapy
from scrapy_redis.spiders import RedisSpider
from scrapy.selector import Selector
from novespider.items import NovespiderItem


class NovspiderSpider(RedisSpider):
    name = "novspider"
    start_urls = ['http://www.daomubiji.com/qi-xing-lu-wang-01.html']

    def parse(self, response):
        selector = Selector(response)
        # in newer Scrapy you can simply use response.xpath("//table")
        table = selector.xpath("//table")
        for each in table:
            bookName = each.xpath("tr/td[@colspan='3']/center/h2/text()").extract()[0]
            content = each.xpath("tr/td/a/text()").extract()
            url = each.xpath("tr/td/a/@href").extract()
            for i in range(len(url)):
                item = NovespiderItem()
                # book name and chapter url
                item['bookName'] = bookName
                item['chapterURL'] = url[i]
                try:
                    # book title and chapter number
                    item['bookTitle'] = content[i].split(' ')[0]
                    item['chapterNum'] = content[i].split(' ')[1]
                except Exception as e:
                    continue
                try:
                    item['chapterName'] = content[i].split(' ')[2]
                except Exception as e:
                    item['chapterName'] = content[i].split(' ')[1][-3:]
                # fetch the chapter page and parse the body text there
                yield scrapy.Request(url[i], callback=self.parseContent, meta={'item': item})

    def parseContent(self, response):
        selector = Selector(response)
        item = response.meta['item']
        html = selector.xpath('//div[@class="content"]').extract()[0]
        # the HTML tags inside the original regexes were lost when this article was extracted;
        # the patterns below are an assumed reconstruction: take everything after the
        # clear:both divider, then collect the text of each paragraph
        textField = re.search('<div style="clear:both"></div>(.*?)<div', html, re.S).group(1)
        text = re.findall('<p>(.*?)</p>', textField, re.S)
        fulltext = ""
        for each in text:
            fulltext += each
        item['text'] = fulltext
        yield item
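One more thing about RedisSpider: a scrapy_redis spider normally does not crawl from start_urls at all; it waits until start URLs are pushed into a Redis list (by default named <spider name>:start_urls, configurable through a redis_key attribute on the spider). A minimal way to seed that list from Python, assuming a local Redis on the default port:

import redis

# push the first chapter URL into the list the RedisSpider polls for start requests
r = redis.StrictRedis(host='127.0.0.1', port=6379)
r.lpush('novspider:start_urls', 'http://www.daomubiji.com/qi-xing-lu-wang-01.html')

The same can be done from the shell with redis-cli lpush novspider:start_urls <url>.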
