On passing custom arguments when launching a Scrapy crawl ~
- run.py lives in the commands folder (at the same level as the spiders folder) and is what launches the crawl:
import scrapy.commands.crawl as crawl
from scrapy.exceptions import UsageError
from scrapy.commands import ScrapyCommand
# noinspection PyPackageRequirements
from dianping.settings import DB_CONF_PATH
from dianping.commands import config
import logging

logger = logging.getLogger(__name__)


class Command(crawl.Command):
    # database connection info is read once from the shared DB config file
    db_conf = config.read_config(DB_CONF_PATH)
    redis_pwd = db_conf.get('redis_passwd')
    redis_host = db_conf.get('redis_host')
    redis_port = db_conf.get('redis_port')

    def syntax(self):
        return "<dianping> <spider>"

    def short_desc(self):
        return "Crawl dianping"

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
                          help="set spider argument (may be repeated)")
        parser.add_option("-o", "--output", metavar="FILE",
                          help="dump scraped items into FILE (use - for stdout)")
        parser.add_option("-t", "--output-format", metavar="FORMAT",
                          help="format to use for dumping items with -o")
        parser.add_option("-f", "--spider_config", nargs=1, type="str", dest="spider_config", default="",
                          help="The name of the configuration file")

    def run(self, args, opts):
        # the value passed to -f arrives here as opts.spider_config
        spider_config_path = opts.spider_config
        spider_conf = config.read_config(spider_config_path)
        redis_db = spider_conf.get('redis_db')
        is_flush = spider_conf.get('flush_redis')
        redis_url = 'redis://:{}@{}:{}/{}'.format(self.redis_pwd, self.redis_host, self.redis_port, redis_db)
        self.settings.set('SPIDER_CONF', spider_config_path)
        self.settings.set('REDIS_URL', redis_url)
        logger.info('****** Using Redis database number %s ******' % redis_db)
        if is_flush in ['True', 'true']:
            self.settings.set('SCHEDULER_FLUSH_ON_START', True)
        if len(args) < 1:
            raise UsageError()
        elif len(args) > 1:
            raise UsageError("running 'scrapy crawl' with more than one spider is no longer supported")
        spider_name = args[0]
        self.crawler_process.crawl(spider_name, **opts.spargs)
        self.crawler_process.start()
In fact, this is just a modified version of crawl.py from the commands directory in Scrapy's own source!
Now for the key part!
def add_options(self, parser):
    ScrapyCommand.add_options(self, parser)
    parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
                      help="set spider argument (may be repeated)")
    parser.add_option("-o", "--output", metavar="FILE",
                      help="dump scraped items into FILE (use - for stdout)")
    parser.add_option("-t", "--output-format", metavar="FORMAT",
                      help="format to use for dumping items with -o")
    parser.add_option("-f", "--spider_config", nargs=1, type="str", dest="spider_config", default="",
                      help="The name of the configuration file")
Next, look at the run() function:
def run(self, args, opts):
    spider_config_path = opts.spider_config
    spider_conf = config.read_config(spider_config_path)
-
In other words, Scrapy parses the command line with the parser we configured in add_options() and hands the result to run() as opts, so the -f value (the path to our configuration file) is available as opts.spider_config.
-
read_config is just a helper function in the config module that turns the lines of our configuration file into key-value pairs:
-
def read_config(path, conf=None):
    if conf is None:
        conf = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            if not line == '\n':
                li = line.strip()
                if not li.startswith('#'):
                    try:
                        key, value = li.split(':', 1)
                        conf[key] = value
                    except Exception as e:
                        print(e)
                        print(li)
    return conf
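A quick way to see what it produces (a throwaway demo; the file name here is made up):

# minimal check of read_config
sample = '# Data storage:\nmongo_db:dianping_pro\nredis_db:13\n'
with open('demo_config.txt', 'w', encoding='utf-8') as f:
    f.write(sample)
print(read_config('demo_config.txt'))
# -> {'mongo_db': 'dianping_pro', 'redis_db': '13'}   (comment line skipped, every value stays a string)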
Further down it just configures Redis and reuses a bit of the original command's source, so there is not much to explain:
-
redis_db = spider_conf.get('redis_db')
is_flush = spider_conf.get('flush_redis')
redis_url = 'redis://:{}@{}:{}/{}'.format(self.redis_pwd, self.redis_host, self.redis_port, redis_db)
self.settings.set('SPIDER_CONF', spider_config_path)
self.settings.set('REDIS_URL', redis_url)
logger.info('****** Using Redis database number %s ******' % redis_db)
if is_flush in ['True', 'true']:
    self.settings.set('SCHEDULER_FLUSH_ON_START', True)
if len(args) < 1:
    raise UsageError()
elif len(args) > 1:
    raise UsageError("running 'scrapy crawl' with more than one spider is no longer supported")
spider_name = args[0]
self.crawler_process.crawl(spider_name, **opts.spargs)
self.crawler_process.start()
Here is the structure of one of the configuration files:
# Data storage:
mongo_db:dianping_pro
mongo_col:ods_dianping_around
# Redis database number used to store the crawler's requests
redis_db:13
# Category info
food:ch35
# City lists
# Tier-1 cities
city:['beijing','shanghai','tianjin','shenzhen','guangzhou','chengdu','hangzhou','chongqing','wuhan','suzhou','xian','nanjing','zhengzhou','changsha','shenyang','qingdao','ningbo','dongguan','wuxi']
# Tier-2 cities
#city:['kunming','dalian','xiamen','hefei','foshan','fuzhou','jinan','wenzhou','changchun','shijiazhuang','changzhou','quanzhou','nanning','guiyang','nanchang','nantong','jinhua','xuzhou','taiyuan','jiaxing','yantai','huizhou','baoding','taizhou','zhongshan','shaoxing','weifang','lanzhou']
# Score labels
score_one:总分
score_two:环境
score_three:服务
Next, in settings.py, enable external argument passing and point to the database configuration file:
# enable external argument passing (register the custom commands module)
COMMANDS_MODULE = 'dianping.commands'
DB_CONF_PATH = '/home/dy-data/database_config/database_config.txt'
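One more settings detail: the pipeline shown below only runs if it is registered in ITEM_PIPELINES. The post doesn't show that part of settings.py, so the entry here is an assumption based on the standard project layout (dianping/pipelines.py; the priority 300 is an arbitrary choice):

ITEM_PIPELINES = {
    'dianping.pipelines.DianpingPipeline': 300,
}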
Next, let's look at pipelines.py:
# -*- coding: utf-8 -*-
import time
import pymongo
import pymysql
import logging
from urllib import parse
from dianping.commands import config
from scrapy_redis import connection

logger = logging.getLogger(__name__)


class DianpingPipeline(object):
    def __init__(self, db_conf, spider_conf, redis_server):
        # connection info comes from the shared database config file
        self.sql_host = db_conf.get('mysql_host')
        self.sql_port = int(db_conf.get('mysql_port'))
        self.sql_pwd = db_conf.get('mysql_passwd')
        self.sql_user = db_conf.get('mysql_account')
        self.sql_map_db = db_conf.get('sql_cat_map_db')
        self.sql_map_table = db_conf.get('sql_cat_map_table')
        self.mongo_host = db_conf.get('mongo_host')
        self.mongo_port = db_conf.get('mongo_port')
        self.mongo_user = db_conf.get('mongo_account')
        self.mongo_pwd = parse.quote(db_conf.get('mongo_passwd'))
        # per-spider values come from the spider config file passed with -f
        self.mongo_db = spider_conf.get('mongo_db')
        self.mongo_col = spider_conf.get('mongo_col')
        self.sql_db = spider_conf.get('mysql_list_db')
        self.sql_table = spider_conf.get('mysql_list_table')
        self.sql_select = spider_conf.get('get_id_sql')
        self.kind = spider_conf.get('food')
        self.city = spider_conf.get('city')
        self.score_one = spider_conf.get('score_one')
        self.score_two = spider_conf.get('score_two')
        self.score_three = spider_conf.get('score_three')
        self.keyword = spider_conf.get('keyword')
        self.spider_level = spider_conf.get('spider_level', 'deputy')
        self.redis_server = redis_server

    # @classmethod
    # def from_crawler(cls, crawler):
    #     return cls(crawler.settings)

    @classmethod
    def from_crawler(cls, crawler):
        # read back the paths that the custom command stored in settings
        db_conf_path = crawler.settings.get('DB_CONF_PATH')
        spider_conf_path = crawler.settings.get('SPIDER_CONF')
        return cls(
            db_conf=config.read_config(db_conf_path),
            spider_conf=config.read_config(spider_conf_path),
            redis_server=connection.from_settings(crawler.settings)
        )

    def open_spider(self, spider):
        # hand the config values over to the spider instance
        kind = self.kind
        city = self.city
        score_one = self.score_one
        score_two = self.score_two
        score_three = self.score_three
        spider.kind = kind
        spider.city = city
        spider.score_one = score_one
        spider.score_two = score_two
        spider.score_three = score_three
        if self.sql_db:
            self.conn = pymysql.connect(self.sql_host, self.sql_user, self.sql_pwd, port=self.sql_port, charset='utf8')
            self.cursor = self.conn.cursor()
        if self.mongo_db:
            mongo_uri = 'mongodb://{}:{}@{}:{}/'.format(self.mongo_user, self.mongo_pwd, self.mongo_host, self.mongo_port)
            self.client = pymongo.MongoClient(mongo_uri)
            self.db = self.client[self.mongo_db]
        if self.sql_db:
            self.cursor.execute("USE {}".format(self.sql_db))

    def process_item(self, item, spider):
        # self.db[self.mongo_col].insert(dict(item))
        #
        # return item
        if self.mongo_col:
            self.db[self.mongo_col].insert(dict(item))
        elif self.sql_table:
            self.insert_sql(dict(item))
        else:
            logger.error('*** No storage method specified! ***')
        return item

    def insert_sql(self, data):
        keys = ', '.join(data.keys())
        values = ', '.join(['%s'] * len(data))
        sql = 'insert into `%s` (%s) values (%s)' % (self.sql_table, keys, values)
        try:
            self.cursor.execute(sql, tuple(data.values()))
            self.conn.commit()
        except Exception as e:
            self.conn.rollback()
            if e.args[0] != 1062:  # ignore duplicate-insert errors
                logging.error(e)

    def close_spider(self, spider):
        # today_no_bar = time.strftime("%Y%m%d", time.localtime(time.time()))
        # logger.warning('mongo:meituan_hotel_{}_job_done'.format(today_no_bar))  # log a completion message
        if self.mongo_db:
            self.client.close()
        if self.sql_db:
            self.conn.close()
OK, on to the next key point:
def __init__(self, db_conf, spider_conf, redis_server):
    self.sql_host = db_conf.get('mysql_host')
    self.sql_port = int(db_conf.get('mysql_port'))
    self.sql_pwd = db_conf.get('mysql_passwd')
    self.sql_user = db_conf.get('mysql_account')
    self.sql_map_db = db_conf.get('sql_cat_map_db')
    self.sql_map_table = db_conf.get('sql_cat_map_table')
    self.mongo_host = db_conf.get('mongo_host')
    self.mongo_port = db_conf.get('mongo_port')
    self.mongo_user = db_conf.get('mongo_account')
    self.mongo_pwd = parse.quote(db_conf.get('mongo_passwd'))
    self.mongo_db = spider_conf.get('mongo_db')
    self.mongo_col = spider_conf.get('mongo_col')
    self.sql_db = spider_conf.get('mysql_list_db')
    self.sql_table = spider_conf.get('mysql_list_table')
    self.sql_select = spider_conf.get('get_id_sql')
    self.kind = spider_conf.get('food')
    self.city = spider_conf.get('city')
    self.score_one = spider_conf.get('score_one')
    self.score_two = spider_conf.get('score_two')
    self.score_three = spider_conf.get('score_three')
    self.keyword = spider_conf.get('keyword')
    self.spider_level = spider_conf.get('spider_level', 'deputy')
    self.redis_server = redis_server

@classmethod
def from_crawler(cls, crawler):
    db_conf_path = crawler.settings.get('DB_CONF_PATH')
    spider_conf_path = crawler.settings.get('SPIDER_CONF')
    return cls(
        db_conf=config.read_config(db_conf_path),
        spider_conf=config.read_config(spider_conf_path),
        redis_server=connection.from_settings(crawler.settings)
    )
-
Anyone who uses Scrapy regularly will know that spiders, downloader middlewares and pipelines commonly use from_crawler to receive parameters.
-
The parameters can be read straight from crawler.settings.
-
The spider itself is initialized inside the Crawler class, which calls the spider's classmethod from_crawler() to create it.
-
For the full details, have a look at the Scrapy source yourself; the sketch below shows the chain end to end.
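Here is a stripped-down, self-contained sketch of how a value placed in settings inside the command's run() becomes visible to a component's from_crawler(). DemoSpider and ConfAwarePipeline are made-up stand-ins, not the project's real classes, and this is not Scrapy's internal source:

from scrapy import Spider
from scrapy.crawler import CrawlerProcess


class DemoSpider(Spider):
    # hypothetical minimal spider, only here so the crawl can start
    name = 'demo'
    start_urls = []


class ConfAwarePipeline(object):
    # hypothetical component following the same pattern as DianpingPipeline
    def __init__(self, conf_path):
        self.conf_path = conf_path
        print('pipeline received SPIDER_CONF =', conf_path)

    @classmethod
    def from_crawler(cls, crawler):
        # crawler.settings carries everything set earlier,
        # including what the command stored with self.settings.set('SPIDER_CONF', ...)
        return cls(conf_path=crawler.settings.get('SPIDER_CONF'))


process = CrawlerProcess(settings={
    'SPIDER_CONF': '/tmp/demo_config.txt',        # what Command.run() does via self.settings.set(...)
    'ITEM_PIPELINES': {ConfAwarePipeline: 100},   # newer Scrapy accepts the class object; a dotted string path also works
})
process.crawl(DemoSpider)
process.start()   # while setting up the crawl, Scrapy calls ConfAwarePipeline.from_crawler(crawler)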
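One more thing worth spelling out before running it: in open_spider() the pipeline copies the config values onto the spider instance (spider.kind, spider.city and so on), so the spider can use them without knowing anything about config files. A hypothetical spider-side sketch follows; this is not the project's real 'pro' spider, and the URL pattern is only illustrative:

import ast
import scrapy


class ProSpider(scrapy.Spider):
    # stand-in for the project's spider, which the post does not show
    name = 'pro'

    def start_requests(self):
        # self.kind and self.city were attached by DianpingPipeline.open_spider()
        # before the first request is generated
        cities = ast.literal_eval(self.city)   # the config stores the city list as its literal text
        for city in cities:
            url = 'https://www.dianping.com/{}/{}'.format(city, self.kind)
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        pass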
How to run it ~
scrapy run <spider name> -f <path to configuration file>
Scheduled runs (cron)
-
Create a new shell script:
-
# Unified log date:
date_log=$(date +\%Y\%m\%d)   # date on which the shell script was started
cd /home/dy-data/data/spider/dianping
scrapy run pro -f ./dianping/config/dianping_config/dianping_food_config.txt &> /home/dy-data/log/${date_log}/spider/dianping_food_detail_category_1.log &
p1=$!
scrapy run pro -f ./dianping/config/dianping_config/dianping_food_config.txt &> /home/dy-data/log/${date_log}/spider/dianping_food_detail_category_2.log &
p2=$!
scrapy run pro -f ./dianping/config/dianping_config/dianping_food_config.txt &> /home/dy-data/log/${date_log}/spider/dianping_food_detail_category_3.log &
p3=$!
# p1, p2, p3 hold the process IDs of the three crawler runs
wait $p1
wait $p2
wait $p3
#date_clean=$(date +\%Y\%m\%d)
# once the p1, p2 and p3 crawlers have finished, run the cleaning script automatically
cd /home/dy-data/data/data_cleaning/dianping_clean && python3 clean_food_detail.py ${date_log} &> /home/dy-data/log/${date_log}/data_cleaning/dianping_clean_food_detail.log &
#p4=$!
## ${date_log} is a variable reference ~
-
If this still isn't clear, look up some basic shell scripting and it will click right away ~
-
On the command line, run crontab -e and add the entry:
-
22 12 * * * cd <folder containing the shell script> && sh cron_dianping_food.sh
-
The five fields are, in order: minute, hour, day of month, month, day of week, so the entry above fires every day at 12:22.
-