I. Configuring MySQL
Modify settings.py
# start MySQL database configure setting
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'cnblogsdb'
MYSQL_USER = 'root'
MYSQL_PASSWD = 'root'
# end of MySQL database configure setting
Modify pipelines.py
[root@bogon cnblogs]# more pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy import signals
import json
import codecs
from twisted.enterprise import adbapi
from datetime import datetime
from hashlib import md5
import MySQLdb
import MySQLdb.cursors
import logging
class JsonWithEncodingCnblogsPipeline(object):
def __init__(self):
self.file = codecs.open('cnblogs.json', 'w', encoding='utf-8')
def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
return item
def spider_closed(self, spider):
self.file.close()
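Note that Scrapy only calls a pipeline method automatically if it is named close_spider; as written, spider_closed has to be wired to the spider_closed signal yourself. A minimal sketch of that hookup (the from_crawler classmethod is my addition to the class above, reusing the signals import already present):

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        # Connect spider_closed to the matching signal so the JSON file gets closed
        crawler.signals.connect(pipeline.spider_closed, signal=signals.spider_closed)
        return pipeline

Renaming the method to close_spider(self, spider) would achieve the same result without the signal hookup.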
class MySQLStoreCnblogsPipeline(object):
def __init__(self, dbpool):
self.dbpool=dbpool
@classmethod
def from_settings(cls, settings):
dbargs=dict(
host=settings['MYSQL_HOST'],
db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
charset='utf8',
cursorclass=MySQLdb.cursors.DictCursor,
use_unicode=True,
)
dbpool= adbapi.ConnectionPool('MySQLdb', **dbargs)
return cls(dbpool)
    # Invoked by the pipeline for every item
def process_item(self, item, spider):
d=self.dbpool.runInteraction(self._do_upinsert, item, spider)
d.addErrback(self._handle_error, item, spider)
d.addBoth(lambda _: item)
return d
    # Update or insert each record into the database
def _do_upinsert(self, conn, item, spider):
linkmd5id=self._get_linkmd5id(item)
#print linkmd5id
now= datetime.utcnow().replace(microsecond=0).isoformat(' ')
conn.execute(""" select 1 from cnblogsinfo where linkmd5id = %s""", (linkmd5id, ))
        ret = conn.fetchone()
        if ret:
conn.execute(""" update cnblogsinfo set title = %s, description = %s, link = %s, listUrl = %s, updated = %s where linkmd5id = %s""", (item['title'], item['desc'], item['link'], item['listUrl'], now, linkmd5id))
#print """ # update cnblogsinfo set title = %s, description = %s, link = %s, listUrl = %s, updated = %s where linkmd5id = %s
#""", (item['title'], item['desc'], item['link'], item['listUrl'], now, linkmd5id)
else:
conn.execute("""insert into cnblogsinfo(linkmd5id, title, description, link, listUrl, updated)
values(%s, %s, %s, %s, %s, %s)""", (linkmd5id, item['title'], item['desc'], item['link'], item['listUrl'], now))
#print """# insert into cnblogsinfo(linkmd5id, title, description, link, listUrl, updated)
# values(%s, %s, %s, %s, %s, %s)
#""", (linkmd5id, item['title'], item['desc'], item['link'], item['listUrl'], now)
    # Compute the MD5 digest of the url
    def _get_linkmd5id(self, item):
        # md5-hash the url so duplicate articles are not collected twice
        return md5(item['link']).hexdigest()
    # Error handling
    def _handle_error(self, failure, item, spider):
        logging.error(failure)
Modify the settings.py configuration file to enable MySQLStoreCnblogsPipeline:
ITEM_PIPELINES = {
    'cnblogs.pipelines.JsonWithEncodingCnblogsPipeline': 300,
    'cnblogs.pipelines.MySQLStoreCnblogsPipeline': 300,
}
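The MySQL pipeline assumes a cnblogsinfo table already exists in cnblogsdb. A one-off helper sketch that creates it, matching the column names used in the SQL above (the column types and sizes are my guesses, not from the original post):

# create_table.py -- run once before the first crawl
import MySQLdb

conn = MySQLdb.connect(host='localhost', user='root', passwd='root',
                       db='cnblogsdb', charset='utf8')
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS cnblogsinfo (
        linkmd5id   CHAR(32) NOT NULL PRIMARY KEY,  -- md5 of the article link
        title       VARCHAR(255),
        description TEXT,
        link        VARCHAR(255),
        listUrl     VARCHAR(255),
        updated     DATETIME
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()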
II. Configuring MongoDB
Install pymongo:
pip install pymongo
Configure the MongoDB host IP, port, database name, and collection name in settings.py, so the MongoDB connection details can be swapped out conveniently.
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'your_database_name'
MONGODB_DOCNAME = 'your_collection_name'
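A quick standalone check (my addition, not part of the project) that these settings point at a reachable MongoDB instance:

import pymongo

client = pymongo.MongoClient(host='127.0.0.1', port=27017)
client.admin.command('ping')   # raises an exception if the server cannot be reached
print(client.server_info()['version'])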
Reference the pipeline from pipelines.py in settings.py so that it takes effect:
ITEM_PIPELINES = ['novespider.pipelines.NovespiderPipeline']
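The list form above only works in older Scrapy releases; newer versions expect ITEM_PIPELINES to be a dict mapping the pipeline path to an order value:

ITEM_PIPELINES = {
    'novespider.pipelines.NovespiderPipeline': 300,
}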
Use MongoDB in pipelines.py:
from scrapy.conf import settings
import pymongo

class NovespiderPipeline(object):
    def __init__(self):
        # Connect to MongoDB
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        dbName = settings['MONGODB_DBNAME']
        table = settings['MONGODB_DOCNAME']
        client = pymongo.MongoClient(host=host, port=port)
        db = client[dbName]
        self.table = db[table]

    def process_item(self, item, spider):
        bookInfo = dict(item)
        self.table.insert(bookInfo)
        return item
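For newer Scrapy releases, where scrapy.conf has been removed, the same pipeline can read its settings through from_crawler instead; pymongo also prefers insert_one over the older insert. A sketch under those assumptions, using the settings keys defined above:

import pymongo

class NovespiderPipeline(object):
    def __init__(self, host, port, db_name, doc_name):
        # Open the MongoDB connection once per pipeline instance
        self.client = pymongo.MongoClient(host=host, port=port)
        self.table = self.client[db_name][doc_name]

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection details from settings.py at crawl time
        s = crawler.settings
        return cls(s['MONGODB_HOST'], s['MONGODB_PORT'],
                   s['MONGODB_DBNAME'], s['MONGODB_DOCNAME'])

    def process_item(self, item, spider):
        self.table.insert_one(dict(item))
        return item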
Example: crawling the nine books of Daomubiji (盗墓笔记) and all their chapters
1. Create the project and the spider
scrapy startproject novespider
cd novespider
scrapy genspider novspider "daomubiji.com"
2. Clarify the requirements and write items.py
Open the site: http://www.daomubiji.com/
Inspect the page structure: each book sits in its own table
import scrapy

class NovespiderItem(scrapy.Item):
    bookName = scrapy.Field()
    bookTitle = scrapy.Field()
    chapterNum = scrapy.Field()
    chapterName = scrapy.Field()
    chapterURL = scrapy.Field()
3. Write the spider file, spiders/novspider.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector
from novespider.items import NovespiderItem

class NovspiderSpider(scrapy.Spider):
    name = "novspider"
    allowed_domains = ["daomubiji.com"]
    start_urls = ['http://www.daomubiji.com']

    def parse(self, response):
        selector = Selector(response)
        table = selector.xpath("//table")  # in newer Scrapy, response.xpath("//table") works directly
        for each in table:
            bookName = each.xpath("tr/td[@colspan='3']/center/h2/text()").extract()[0]
            content = each.xpath("tr/td/a/text()").extract()
            url = each.xpath("tr/td/a/@href").extract()
            for i in range(len(url)):
                item = NovespiderItem()
                # book name and chapter url
                item['bookName'] = bookName
                item['chapterURL'] = url[i]
                try:
                    # book title and chapter number
                    item['bookTitle'] = content[i].split(' ')[0]
                    item['chapterNum'] = content[i].split(' ')[1]
                except Exception as e:
                    continue
                try:
                    item['chapterName'] = content[i].split(' ')[2]
                except Exception as e:
                    item['chapterName'] = content[i].split(' ')[1][-3:]
                yield item
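With the item, pipeline, and spider in place, the crawl is started from the project directory as usual, and the MongoDB pipeline stores every yielded item:
scrapy crawl novspider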
III. Configuring Redis
Install scrapy_redis:
pip install scrapy_redis
Then configure Redis in settings.py:
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'
REDIS_URL = None
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
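scrapy_redis also ships a Redis-backed duplicate filter that is usually enabled together with the scheduler (an optional extra, not in the original settings above):

DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"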
Reuse the project and spider from above.
Add a text field to items.py to hold the body text of each chapter:
import scrapy

class NovespiderItem(scrapy.Item):
    bookName = scrapy.Field()
    bookTitle = scrapy.Field()
    chapterNum = scrapy.Field()
    chapterName = scrapy.Field()
    chapterURL = scrapy.Field()
    text = scrapy.Field()
Write the spider:
# -*- coding: utf-8 -*-
import scrapy
from scrapy_redis.spiders import RedisSpider
from scrapy.selector import Selector
from novespider.items import NovespiderItem
import re

class NovspiderSpider(RedisSpider):
    name = "novspider"
    start_urls = ['http://www.daomubiji.com/qi-xing-lu-wang-01.html']

    def parse(self, response):
        selector = Selector(response)
        table = selector.xpath("//table")  # in newer Scrapy, response.xpath("//table") works directly
        for each in table:
            bookName = each.xpath("tr/td[@colspan='3']/center/h2/text()").extract()[0]
            content = each.xpath("tr/td/a/text()").extract()
            url = each.xpath("tr/td/a/@href").extract()
            for i in range(len(url)):
                item = NovespiderItem()
                # book name and chapter url
                item['bookName'] = bookName
                item['chapterURL'] = url[i]
                try:
                    # book title and chapter number
                    item['bookTitle'] = content[i].split(' ')[0]
                    item['chapterNum'] = content[i].split(' ')[1]
                except Exception as e:
                    continue
                try:
                    item['chapterName'] = content[i].split(' ')[2]
                except Exception as e:
                    item['chapterName'] = content[i].split(' ')[1][-3:]
                yield scrapy.Request(url[i], callback=self.parseContent, meta={'item': item})

    def parseContent(self, response):
        selector = Selector(response)
        item = response.meta['item']
        html = selector.xpath('//div[@class="content"]').extract()[0]
        # Pull the chapter body out of the content div, paragraph by paragraph
        textField = re.search('<div class="content">(.*?)</div>', html, re.S).group(1)
        text = re.findall('<p>(.*?)</p>', textField, re.S)
        fulltext = ""
        for each in text:
            fulltext += each
        item['text'] = fulltext
        yield item
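Note that a RedisSpider is normally fed its start requests through a Redis list rather than start_urls; a minimal sketch of seeding that list, assuming the default key name of <spider name>:start_urls:

import redis

# Push the first chapter URL into the list the RedisSpider polls for start requests;
# 'novspider:start_urls' is the default redis_key for a spider named 'novspider'
r = redis.StrictRedis(host='127.0.0.1', port=6379)
r.lpush('novspider:start_urls', 'http://www.daomubiji.com/qi-xing-lu-wang-01.html')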