本系列文章涉及到的Scrapy为1.2.1版本,运行环境为py2.7。
首先我们查看一下setup.py:
entry_points={
'console_scripts': ['scrapy = scrapy.cmdline:execute']
},
可以看到,框架唯一的入口点是命令行的scrapy命令,对应scrapy.cmdline下的execute方法。
下面查看一下execute方法:
def execute(argv=None, settings=None):
    """Framework entry point bound to the ``scrapy`` console script.

    Resolves argv and settings, locates the requested sub-command,
    parses its options, runs it inside a CrawlerProcess, and exits the
    process with the command's exit code.  Never returns normally:
    every path ends in sys.exit().
    """
    if argv is None:
        argv = sys.argv  # default to the real command line
    # --- backwards compatibility for scrapytest.conf.settings singleton ---
    # If the legacy conf module was already imported, reuse its settings.
    if settings is None and 'scrapytest.conf' in sys.modules:
        from scrapytest import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------
    if settings is None:
        # Normal path: build settings from the project environment.
        settings = get_project_settings()
    check_deprecated_settings(settings)
    # --- backwards compatibility for scrapytest.conf.settings singleton ---
    # Mirror the resolved settings back onto the legacy singleton,
    # silencing the deprecation warning that importing conf would raise.
    import warnings
    from scrapytest.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapytest import conf
        conf.settings = settings
    # ------------------------------------------------------------------
    inproject = inside_project()  # are we inside a scrapy project dir?
    cmds = _get_commands_dict(settings, inproject)  # name -> command object
    cmdname = _pop_command_name(argv)  # e.g. "crawl"; mutates argv
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        # No sub-command given: print the available commands and quit OK.
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        # Unknown sub-command: report it and quit with a usage error code.
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)
    cmd = cmds[cmdname]
    parser.usage = "scrapytest %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    # Command-specific defaults layered on top at 'command' priority.
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    # process_options may itself print help/usage and exit via the parser.
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
简便起见,代码中为向后兼容而保留的部分我就不再逐句解释了,下面主要分析代码的主流程和设计思想。
一、settings.py配置文件加载
进入函数后,首先是配置了argv和settings变量:
# Excerpt (repeated from execute() above): how argv and settings get defaults.
if argv is None:
    argv = sys.argv  # fall back to the process command line
if settings is None:
    settings = get_project_settings()  # build settings from the project env
check_deprecated_settings(settings)  # warn about any deprecated setting names
argv直接取sys.argv,settings则需要调用get_project_settings()函数,函数如下:
ENVVAR = 'SCRAPY_SETTINGS_MODULE'

def get_project_settings():
    """Build and return the Settings object for the active project.

    If the settings-module env var is absent, the environment is first
    initialised for the selected project (default: 'default').  On top
    of the settings module, two legacy override layers are applied at
    'project' priority: a pickled settings dict and raw SCRAPY_* env
    vars (both marked for removal upstream).
    """
    environ = os.environ
    if ENVVAR not in environ:
        # First run in this process: resolve the project and populate env.
        init_env(environ.get('SCRAPY_PROJECT', 'default'))
    settings = Settings()
    module_path = environ.get(ENVVAR)
    if module_path:
        settings.setmodule(module_path, priority='project')
    # XXX: remove this hack
    pickled = environ.get("SCRAPY_PICKLED_SETTINGS_TO_OVERRIDE")
    if pickled:
        settings.setdict(pickle.loads(pickled), priority='project')
    # XXX: deprecate and remove this functionality
    overrides = {}
    for key, value in environ.items():
        if key.startswith('SCRAPY_'):
            overrides[key[7:]] = value  # strip the 'SCRAPY_' prefix
    if overrides:
        settings.setdict(overrides, priority='project')
    return settings
get_project_settings()函数在环境变量SCRAPY_SETTINGS_MODULE(即常量ENVVAR)不存在时,会调用init_env(project)初始化环境。至于'SCRAPY_PROJECT'是在何处写入环境变量的,目前的代码中还没有看到,留待后文分析。
init_env()及其相关函数:
def closest_scrapy_cfg(path='.', prevpath=None):
    """Return the path to the closest scrapytest.cfg file by traversing the current
    directory and its parents
    """
    # Walk upwards; at the filesystem root dirname() is a fixed point,
    # so path == prevpath and the loop stops with no cfg found.
    while path != prevpath:
        path = os.path.abspath(path)
        candidate = os.path.join(path, 'scrapytest.cfg')
        if os.path.exists(candidate):
            return candidate
        path, prevpath = os.path.dirname(path), path
    return ''
def get_sources(use_closest=True):
xdg_config_home = os.environ.get('XDG_CONFIG_HOME') or \
os.path.expanduser('