创建爬虫
scrapy startproject qidian
cd qidian
scrapy genspider finish www.qidian.com/finish?
编写finish爬虫
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from qidian.items import QidianItem
class FinishSpider(scrapy.Spider):
    """Crawl qidian.com's finished-novels listing, yielding one item per novel
    and following the pagination link until the last page."""
    name = 'finish'
    # allowed_domains = ['www.qidian.com/finish?']
    start_urls = ['https://www.qidian.com/finish?/']

    def parse(self, response):
        """Extract name/author/genre for every novel on the page, then
        schedule the next page if a real "next" link exists."""
        novels = response.xpath("//ul[@class='all-img-list cf']/li")
        for novel in novels:
            # BUG FIX: build a fresh item per novel. The original created one
            # QidianItem outside the loop and mutated it on every iteration,
            # so all yielded references pointed at the same object.
            item = QidianItem()
            item['name'] = novel.xpath("./div/h4/a/text()").extract_first()
            item['author'] = novel.xpath("./div/p/a[@class='name']/text()").extract_first()
            item['style'] = novel.xpath("./div/p/a[2]/text()").extract_first()
            yield item
        next_url = response.xpath("//a[contains(@class,'next')]/@href").extract_first()
        # BUG FIX: extract_first() returns None when no next link matches; the
        # original next_url[:4] would then raise TypeError. On the last page
        # the site uses a "javascript:;" placeholder href instead of a URL.
        if next_url and not next_url.startswith("java"):
            yield Request("https:" + next_url, callback=self.parse)
        else:
            print("结束")
不遵守 robots 协议
settings文件中需要修改
ROBOTSTXT_OBEY = False
items文件中确定需要爬取的内容
import scrapy
class QidianItem(scrapy.Item):
    """Container for one novel scraped from qidian.com's finished-books list."""
    # define the fields for your item here like:
    name = scrapy.Field()    # novel title
    author = scrapy.Field()  # author display name
    style = scrapy.Field()   # genre/category label
pipelines文件中保存到mongodb第一种方法
import pymongo
class QidianPipeline(object):
    """Default project pipeline: passes every item through unchanged.

    Persistence is handled by MongoPipeline; this class only keeps the
    item flowing to the next stage of ITEM_PIPELINES.
    """

    def process_item(self, item, spider):
        # Nothing to transform — hand the item straight to the next pipeline.
        return item
class MongoPipeline():
    """Persist scraped items into MongoDB, configured via project settings
    (MONGO_URL / MONGO_DB / MONGO_TABLE read through from_crawler)."""

    def __init__(self, mongo_url, mongo_db, mongo_table):
        self.mongo_url = mongo_url
        self.mongo_db = mongo_db
        self.mongo_table = mongo_table

    @classmethod
    def from_crawler(cls, crawl):
        """Alternate constructor Scrapy calls with the running Crawler;
        pulls the database URI, database name and collection name from
        settings.py."""
        return cls(
            mongo_url=crawl.settings.get('MONGO_URL'),
            mongo_db=crawl.settings.get('MONGO_DB'),
            mongo_table=crawl.settings.get('MONGO_TABLE'),
        )

    def open_spider(self, spider):
        """Called once when the spider opens: connect to MongoDB."""
        self.client = pymongo.MongoClient(self.mongo_url)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider=None):
        """Called once when the spider closes: release the connection.

        BUG FIX: Scrapy invokes close_spider(spider); the original method
        accepted no spider argument and raised TypeError at shutdown. The
        default keeps any direct no-argument call working.
        """
        self.client.close()

    def process_item(self, item, spider):
        """Insert one item into the configured collection.

        BUG FIX: the original returned None, which would feed None into any
        later pipeline in ITEM_PIPELINES; process_item must return the item.
        """
        data = dict(item)
        self.db[self.mongo_table].insert_one(data)
        return item
保存到数据库第二种方法
import pymongo
from scrapy.conf import settings
class QidianPipeline(object):
    """No-op pipeline stage.

    Items are stored by MongoPipeline; this stage simply forwards each
    item unchanged so the pipeline chain stays intact.
    """

    def process_item(self, item, spider):
        return item
class MongoPipeline():
    """Persist scraped items into MongoDB, reading connection details from
    the global Scrapy settings object at construction time.

    NOTE(review): the accompanying `from scrapy.conf import settings` import
    is deprecated in modern Scrapy — prefer the from_crawler pattern shown
    in the first variant.
    """

    def __init__(self):
        # Host, port and database names come from settings.py.
        host = settings['MONGO_HOST']
        port = settings['MONGO_PORT']
        dbname = settings['MONGO_DBNAME']
        # pymongo.MongoClient(host, port) creates the MongoDB connection.
        client = pymongo.MongoClient(host=host, port=port)
        # Select the target database.
        mdb = client[dbname]
        # Collection that receives the scraped novels.
        self.post = mdb[settings['MONGO_TABLE']]

    def process_item(self, item, spider):
        """Insert one item and pass it on to the next pipeline stage."""
        data = dict(item)
        # BUG FIX: Collection.insert() was deprecated in pymongo 3.0 and
        # removed in 4.0 — insert_one() is the supported replacement.
        self.post.insert_one(data)
        return item
还需在settings中打开管道,并设置初始数据库信息
# Enable both pipeline stages; the lower number runs first.
ITEM_PIPELINES = {
    "qidian.pipelines.QidianPipeline": 300,
    "qidian.pipelines.MongoPipeline": 301,
}

# Connection info for the from_crawler-based MongoPipeline (first variant).
MONGO_URL = "127.0.0.1"
# Connection info for the settings-object MongoPipeline (second variant).
MONGO_HOST = "127.0.0.1"
MONGO_PORT = 27017
MONGO_DB = "QiDian"
MONGO_DBNAME = "QiDian2"
# Collection both variants write into.
MONGO_TABLE = "finish"
运行
先打开mongodb数据库
通过如下代码执行scrapy爬虫。
scrapy crawl finish