Example: scraping the chapter listings of www.daomubiji.com
Important: never give a .py file under the spiders folder the same name as the project itself. Otherwise a statement like from <project>.items import xxxItem fails with: ImportError: No module named items
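For reference, a typical layout for such a project looks like this (the spider file name novspider.py matches the code below; any name other than the project's own works):

novelspider/                  <- project root
    scrapy.cfg
    main.py
    novelspider/              <- project package; do not reuse this name below
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            novspider.py      <- any name EXCEPT novelspider.py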
The most important changes are in settings.py:
BOT_NAME = 'novelspider'
SPIDER_MODULES = ['novelspider.spiders']
NEWSPIDER_MODULE = 'novelspider.spiders'
ITEM_PIPELINES = {'novelspider.pipelines.NovelspiderPipeline': 300}  # dict value is the pipeline's priority
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'
COOKIES_ENABLED = True
# Custom keys, read by the MongoDB pipeline below
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'Jikexueyuan'
MONGODB_DOCNAME = 'daomubiji'
Next, pipelines.py, which reads the MongoDB parameters defined in settings.py:
# -*- coding: utf-8 -*-
import pymongo
from scrapy.utils.project import get_project_settings

settings = get_project_settings()

class NovelspiderPipeline(object):
    def __init__(self):
        # Connection parameters come from the custom keys in settings.py
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        dbName = settings['MONGODB_DBNAME']
        client = pymongo.MongoClient(host=host, port=port)
        tdb = client[dbName]
        # Collection that every scraped item is written into
        self.post = tdb[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        bookInfo = dict(item)
        self.post.insert_one(bookInfo)  # insert() is deprecated in pymongo 3+
        return item
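After a crawl, a quick sanity check from a Python shell confirms that documents actually landed in MongoDB (a minimal sketch, assuming a local mongod on the default port and the database/collection names from settings.py):

import pymongo

client = pymongo.MongoClient('127.0.0.1', 27017)
coll = client['Jikexueyuan']['daomubiji']
print(coll.count_documents({}))  # total number of chapters stored
print(coll.find_one())           # inspect one stored document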
items.py declares the fields each scraped item carries:
# -*- coding: utf-8 -*-
from scrapy import Field, Item

class NovelspiderItem(Item):
    bookName = Field()     # series name, taken from the table's <h2> heading
    bookTitle = Field()    # volume title (first token of the link text)
    chapterNum = Field()   # chapter number (second token)
    chapterName = Field()  # chapter title (third token, when present)
    chapterURL = Field()   # link to the chapter page
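A NovelspiderItem behaves like a dict, which is why the pipeline above can simply call dict(item) before inserting. A small illustration (the values are made up):

item = NovelspiderItem()
item['bookName'] = u'盗墓笔记'
item['chapterURL'] = 'http://www.daomubiji.com/...'  # hypothetical URL
print(dict(item))  # {'bookName': u'盗墓笔记', 'chapterURL': '...'}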
The spider itself goes under the spiders folder (with any file name except the project's, per the warning above):
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider
from scrapy.selector import Selector
from novelspider.items import NovelspiderItem
# PyCharm may flag novelspider as an unresolved import because the crawler
# sits inside a larger parent directory. This is a false positive; the
# program runs fine.

class novSpider(CrawlSpider):
    name = "novspider"
    redis_key = 'novspider:start_urls'  # only used when running under scrapy-redis
    start_urls = ['http://www.daomubiji.com']

    def parse(self, response):
        selector = Selector(response)
        # Each book on the index page is laid out as one <table>
        table = selector.xpath('//table')
        for each in table:
            # The series name sits in the table's <h2> heading
            bookName = each.xpath('tr/td[@colspan="3"]/center/h2/text()').extract()[0]
            content = each.xpath('tr/td/a/text()').extract()  # chapter link texts
            url = each.xpath('tr/td/a/@href').extract()       # chapter link targets
            for i in range(len(url)):
                item = NovelspiderItem()
                item['bookName'] = bookName
                item['chapterURL'] = url[i]
                # Link text is "volume chapter-number chapter-name" separated
                # by spaces; skip entries that don't split cleanly.
                try:
                    item['bookTitle'] = content[i].split(' ')[0]
                    item['chapterNum'] = content[i].split(' ')[1]
                except IndexError:
                    continue
                try:
                    item['chapterName'] = content[i].split(' ')[2]
                except IndexError:
                    # No third token: fall back to the last three characters
                    # of the second one.
                    item['chapterName'] = content[i].split(' ')[1][-3:]
                yield item
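To see what the two try/except branches handle, here is how some illustrative link texts split (the sample strings are made up for demonstration, not captured from the live site):

# Three space-separated tokens: the normal case
u'七星鲁王宫 第一章 血尸'.split(' ')
# -> [u'七星鲁王宫', u'第一章', u'血尸']  (bookTitle, chapterNum, chapterName)

# Only two tokens: the second lookup raises IndexError, so chapterName
# falls back to the last three characters of the second token
u'大结局 全文完'.split(' ')
# -> [u'大结局', u'全文完'], chapterName = u'全文完'[-3:]

A text with a single token fails the first lookup and the entry is skipped entirely.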
Finally, set up main.py to launch the crawl:
from scrapy import cmdline
cmdline.execute("scrapy crawl novspider".split())
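Running main.py is equivalent to executing scrapy crawl novspider from the project root (the directory containing scrapy.cfg); wrapping it in a script just makes the crawler easy to launch and debug from an IDE such as PyCharm.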