This setup combines the SQLAlchemy framework, a rule table, and CrawlerRunner. The idea is to store everything the spider file needs as rows in the rule table: name (the spider name), allow_domains, start_urls, next_page (the XPath of the "next page" link), allow_url (the equivalent of allow in a LinkExtractor), and the CSS/XPath selectors of the tags to be scraped; each rule row is then read from the database and passed into the spider at runtime.
Below is the example spider code (the key part is the __init__() method, which wires the rule's fields into the spider):
# -*- coding: utf-8 -*-
import scrapy
import sys
sys.path.append(r'G:\LianjiaScrapy\LianjiaScrapy')
sys.path.append(r'G:\LianjiaScrapy')
from LianjiaScrapy.utils import parse_text
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from LianjiaScrapy.items import Article


class RuletestSpider(CrawlSpider):
    name = 'ruletest'

    def __init__(self, rule):
        # The rule object loaded from the database drives everything below
        self.rule = rule
        print("rule received by the spider")
        print(self.rule)
        self.name = rule.name
        print(self.name)
        self.allowed_domains = rule.allow_domains.split(",")
        print(self.allowed_domains)
        self.start_urls = rule.start_urls.split(",")
        print(self.start_urls)
        rule_list = []
        # Rule for following the "next page" link
        if rule.next_page:
            rule_list.append(Rule(LinkExtractor(restrict_xpaths=rule.next_page), follow=True))
            print(rule.next_page)
        # Rule for extracting article links
        rule_list.append(Rule(LinkExtractor(
            # allow=[rule.allow_url],
            restrict_xpaths=[rule.extract_from]),
            callback='parse_item'))
        print("XPath of the area that article links are extracted from")
        print(rule.extract_from)
        self.rules = tuple(rule_list)
        super(RuletestSpider, self).__init__()

    def parse_item(self, response):
        self.log('Hi, this is an article page! %s' % response.url)
        article = Article()
        print("paging through list pages")
        print(response.url)
        # article["url"] = response.url
        # title = response.xpath(self.rule.title_xpath).extract()
        # article["title"] = parse_text(title, self.rule.name, 'title')
        # body = response.xpath(self.rule.body_xpath).extract()
        # article["body"] = parse_text(body, self.rule.name, 'body')
        # publish_time = response.xpath(self.rule.publish_time_xpath).extract()
        # article["publish_time"] = parse_text(publish_time, self.rule.name, 'publish_time')
        # article["source_site"] = self.rule.source_site
        # return article
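The parse_text helper imported above is part of LianjiaScrapy.utils and is not shown in this post. A minimal sketch of what such a helper might look like, assuming it only joins and strips the strings returned by extract() (the real version may do site-specific cleanup):

# utils.py (hypothetical sketch; not the actual LianjiaScrapy.utils implementation)
def parse_text(extracted, rule_name, field_name):
    """Join the list returned by extract(), strip whitespace, and return plain text."""
    if not extracted:
        return ''
    # rule_name and field_name could drive per-site cleanup; they are ignored here
    return ''.join(s.strip() for s in extracted).strip()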
Below, the model file and the pipeline file work together (this is where SQLAlchemy is used most directly): the model defines every field the spider needs, and the pipeline writes them into the database. First, the model file:
import datetime
from sqlalchemy.engine.url import URL
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime
# from coolscrapy.settings import DATABASE

Base = declarative_base()


def db_connect():
    return create_engine("mysql+pymysql://root:root@localhost:3306/aa", max_overflow=4)


def create_news_table(engine):
    """Create all tables defined on Base if they do not exist yet."""
    Base.metadata.create_all(engine)


def _get_date():
    return datetime.datetime.now()


class ourselfRule(Base):
    """Custom article-crawling rule."""
    __tablename__ = 'article_rule'

    id = Column(Integer, primary_key=True)
    # Rule name
    name = Column(String(30))
    # Allowed domains, comma separated
    allow_domains = Column(String(100))
    # Start URLs, comma separated
    start_urls = Column(String(100))
    # XPath of the "next page" link
    next_page = Column(String(100))
    # Regular expression (substring) matching article links
    allow_url = Column(String(200))
    # XPath of the area article links are extracted from
    extract_from = Column(String(200))
    # XPath of the article title
    title_xpath = Column(String(100))
    # XPath of the article body
    body_xpath = Column(Text)
    # XPath of the publish time
    publish_time_xpath = Column(String(30))
    # Source site of the article
    source_site = Column(String(30))
    # Whether this rule is enabled
    enable = Column(Integer)
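To check the model outside of Scrapy, a rule row can be seeded directly with a plain SQLAlchemy session. A minimal sketch, assuming the MySQL database "aa" from db_connect() exists and the values below are placeholders:

# standalone sketch for seeding a test rule into article_rule
from sqlalchemy.orm import sessionmaker
from models import db_connect, create_news_table, ourselfRule

engine = db_connect()
create_news_table(engine)          # creates the article_rule table if it is missing
Session = sessionmaker(bind=engine)
session = Session()
session.add(ourselfRule(name="demo",
                        allow_domains="example.com",
                        start_urls="http://example.com/news/",
                        enable=1))
session.commit()
print(session.query(ourselfRule).filter(ourselfRule.enable == 1).count())
session.close()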
The pipeline file is as follows (its purpose is to insert one rule into the database using SQLAlchemy):
import pymysql
# The Twisted async IO framework could be used here for asynchronous writes.
# SQLAlchemy imports
import sqlalchemy
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, ForeignKey, UniqueConstraint, Index
from sqlalchemy.orm import sessionmaker, relationship
from sqlalchemy import create_engine
from LianjiaScrapy.models import db_connect, create_news_table, Lagoujob, ourselfRule
from contextlib import contextmanager


# Transaction helper built on SQLAlchemy sessions
@contextmanager
def session_scope(Session):
    """Provide a transactional scope around a series of operations."""
    session = Session()
    session.expire_on_commit = False
    try:
        yield session
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()


class ourselfRulePipeline(object):
    """Save a rule to the database."""

    def __init__(self):
        engine = db_connect()
        create_news_table(engine)
        self.Session = sessionmaker(bind=engine)

    def open_spider(self, spider):
        """This method is called when the spider is opened."""
        pass

    def process_item(self, item, spider):
        a = ourselfRule(name="b2b",
                        allow_domains="www.yub2b.com",
                        start_urls="http://www.yub2b.com/news/list-49.html",
                        next_page="//div[@class='pages']/a[2]",
                        allow_url="r'.*/news/.*'",
                        extract_from="//div[@class='catlist']/ul/li/a",
                        title_xpath="//div[@class='catlist']/ul/li/a/text()",
                        publish_time_xpath="//div[@class='catlist']/ul/li/span/text()",
                        source_site="",
                        enable=1)
        with session_scope(self.Session) as session:
            session.add(a)
        return item

    def close_spider(self, spider):
        pass
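For this pipeline to run at all, it has to be registered in the project's settings.py. A sketch, assuming the class above lives in LianjiaScrapy/pipelines.py (the module path is an assumption):

# settings.py (sketch)
ITEM_PIPELINES = {
    'LianjiaScrapy.pipelines.ourselfRulePipeline': 300,
}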
Next, SQLAlchemy is used to fetch the enabled rules from the MySQL database, and each rule is handed to the spider:
import logging
# Twisted reactor for asynchronous scheduling
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging
import sys
sys.path.append(r'G:\LianjiaScrapy\LianjiaScrapy')
sys.path.append(r'G:\LianjiaScrapy\LianjiaScrapy\spiders')
# Pull in the SQLAlchemy models and helpers defined in models.py
from models import db_connect, create_news_table, Lagoujob, ourselfRule
# sessionmaker is used to query the rules via SQLAlchemy
from sqlalchemy.orm import sessionmaker
# The spider defined above
from spiders.ruletest import RuletestSpider

# Run the code below only when this script is executed directly
if __name__ == '__main__':
    settings = get_project_settings()
    configure_logging(settings)
    db = db_connect()
    Session = sessionmaker(bind=db)
    session = Session()
    rules = session.query(ourselfRule).filter(ourselfRule.enable == 1).all()
    session.close()
    runner = CrawlerRunner(settings)
    for rule in rules:
        # spider = ArticleSpider(rule)  # instantiate every spider using rule
        # stop reactor when spider closes
        # runner.signals.connect(spider_closing, signal=signals.spider_closed)
        print(rule)
        print(rule.name)
        print(RuletestSpider)
        runner.crawl(RuletestSpider, rule=rule)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    # blocks the process, so always keep it as the last statement
    reactor.run()
    logging.info('all finished.')
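CrawlerRunner plus a manual reactor.run() gives full control over the Twisted reactor; for a simple one-shot script, CrawlerProcess manages the reactor by itself. A minimal sketch of the same loop using CrawlerProcess (rules would be queried from article_rule exactly as above):

# alternative sketch: CrawlerProcess starts and stops the reactor itself
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
for rule in rules:
    process.crawl(RuletestSpider, rule=rule)
process.start()  # blocks until all spiders finish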