A template I happened to write, for launching a specified Scrapy spider:
# This file launches a specified spider
import configparser as cps
import os, sLogin, sys, base64
from scrapy import cmdline

# Path to the config file (raw string, so the backslashes are not treated as escapes)
ini_path = r"E:\Code\Zhihu3.0\huxijun.ini"
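# A guess at the layout of huxijun.ini, inferred from the conf.get()/getboolean()
# calls in __init__ below; the section and key names come from this file, the
# values here are placeholders:
#
#   [path]
#   Root_Dir = 1
#   Conf_DirName = conf
#   Cookies_Suffix = _cookies
#
#   [account]
#   name = huxijun
#
#   [run]
#   printlog = true
#   ok_pause = false
#
# A numeric Root_Dir means "climb that many parent directories above this file";
# any other value is used as the project root path directly.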
class Scp:
    def makedir(self, conf, DirName):
        """Create the directory if it does not exist yet, then return its path."""
        thePath = os.path.join(self.Root_path, conf["path"][DirName])
        if not os.path.exists(thePath):
            os.makedirs(thePath)
        return thePath
    def __init__(self):
        """Verify that login works and keep the account's id, name and cookies."""
        # Load the config file
        conf = cps.ConfigParser()
        conf.read(ini_path, encoding='utf-8')
        if conf.get("path", "Root_Dir").isdigit():
            ls = conf.getint("path", "Root_Dir") + 1
            self.Root_path = os.path.dirname(__file__)
            while (ls := ls - 1):
                self.Root_path = os.path.dirname(self.Root_path)
        else:
            self.Root_path = conf.get("path", "Root_Dir")
        self.conf_dir = self.makedir(conf, 'Conf_DirName')
        self.print_log = conf.getboolean("run", "printlog")
        self.ok_pause = conf.getboolean("run", "ok_pause")
        # Data_dir = self.makedir(conf, 'Data_DirName')
        # Pro_dir = self.makedir(conf, 'Pro_DirName')
        # Log_dir = self.makedir(conf, 'Log_DirName')
        # Automatic login
        # cookies file name: account + suffix + ".JSON"
        cookies_path = os.path.join(self.conf_dir, conf['account']['name'] + conf['path']['Cookies_Suffix'] + ".JSON")
        # sLogin.login() is expected to return a dict with the keys
        # "alt" (success flag), "cookies", "name", "id" and "code"
        info = sLogin.login(cookies_path)
        if info["alt"]:
            self._cookies = info['cookies']
            self._name = info['name']
            self._id = info['id']
            print(f"User {info['name']} logged in, ID = {info['id']}")
        else:
            print(f"Login failed, error code: {info['code']}")
            sys.exit()
    def get_py_names(self):
        """Collect the paths of runnable spider files under the project root."""
        # print(f"Searching directory: {self.Root_path}")
        FileName = []
        def get_dir(dir, dirname="spiders"):
            # Build the target path once; re-joining inside the loop would
            # mangle it after the first iteration
            target = os.path.join(dir, dirname)
            for adir in os.listdir(dir):
                adir = os.path.join(dir, adir)
                if adir == target:
                    for file in os.listdir(adir):
                        if file != "__init__.py" and os.path.splitext(file)[1] == ".py":
                            FileName.append(os.path.join(adir, file))
                elif os.path.isdir(adir):
                    get_dir(adir)
        get_dir(self.Root_path)
        return FileName
    def run_py(self, pyname):
        """Run the specified spider file; the argument is the file name."""
        for filename in self.get_py_names():
            if pyname == os.path.split(filename)[1]:
                self._run_spider(filename, pyname)
                break
        else:
            print("Spider file not found! get_py_names() lists the runnable files")
            sys.exit()
    def _run_spider(self, filefullname, pyname):
        """Run a spider given its full path; only run_py() should call this."""
        print(f"Spider path: {filefullname}")
        workdir = os.path.split(filefullname)[0]
        # Switch to the working directory
        os.chdir(workdir)
        # File name without its extension, which is also the spider name
        filename = os.path.splitext(pyname)[0]
        # Build the command line
        cmd = "scrapy crawl " + filename
        if not self.print_log:
            cmd += " --nolog"
        if self.ok_pause:
            # JOBDIR lets the crawl be paused and resumed later
            cmd += " -s JOBDIR=" + self.conf_dir
        # The cookies must not contain spaces, because the command is split on them
        cookies = base64.b64encode(self._cookies.encode()).decode()
        cmd += f" -a id={self._id} -a name={self._name} -a cookies={cookies}"
        # print(cmd)
        cmdline.execute(cmd.split())
if __name__ == '__main__':
    spider = Scp()
    spider.run_py("RedisToMongoDB.py")
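If you are not sure which file name run_py() expects, the same object can list the candidates first; this uses only the get_py_names() method defined above:

spider = Scp()
for path in spider.get_py_names():
    print(path)  # every runnable spider file found under the project root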
So how does the spider receive these extra parameters? Scrapy passes each -a key=value pair from the command line to the spider's __init__ as a keyword argument:
# RedisToMongoDB.py
import scrapy

class RedistomongodbSpider(scrapy.Spider):
    name = 'RedisToMongoDB'
    allowed_domains = ['zhihu.com']
    start_urls = ['http://zhihu.com/']

    def __init__(self, id, name, cookies, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The three -a arguments from the launcher arrive here
        # print(f"{id=}")
        # print(f"{name=}")
        # print(f"{cookies=}")

    def parse(self, response):
        pass
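Remember that the launcher base64-encoded the cookies before putting them on the command line, so the spider has to reverse that step before using them. A minimal sketch, assuming the decoded cookies string is a JSON object (the launcher reads it from a .JSON file; the attribute names here are illustrative):

import base64, json
import scrapy

class RedistomongodbSpider(scrapy.Spider):
    name = 'RedisToMongoDB'

    def __init__(self, id, name, cookies, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.account_id = id      # -a values arrive as plain strings
        self.account_name = name
        # Undo the launcher's base64 step; assumes the decoded text is JSON
        self.cookies = json.loads(base64.b64decode(cookies).decode())

    def parse(self, response):
        pass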