import argparse,os,datetime,logging,time
from urllib.parse import quote
from scrapy.cmdline import execute
from crawler import settings
now = datetime.datetime.now()

# --- Logging setup -------------------------------------------------------
# Write a timestamped log file under <parent-of-cwd>/Logs/,
# e.g. .../Logs/202404021545.log
logger = logging.getLogger('crawler.py')
logger.setLevel(logging.INFO)
rq = time.strftime('%Y%m%d%H%M', time.localtime(time.time()))
log_path = os.path.dirname(os.getcwd()) + '/Logs/'
# Bug fix: logging.FileHandler raises FileNotFoundError if the Logs/
# directory does not exist yet -- create it up front.
os.makedirs(log_path, exist_ok=True)
log_name = log_path + rq + '.log'
logfile = log_name
fh = logging.FileHandler(logfile, mode='w')
# NOTE: the handler level (DEBUG) is effectively moot here because the
# logger itself filters at INFO.
fh.setLevel(logging.DEBUG)
formatter = logging.Formatter(
    "%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")
fh.setFormatter(formatter)
logger.addHandler(fh)
def main():
    """Dispatch to the Scrapy spider selected on the command line.

    Reads the module-level ``FLAGS`` namespace produced by argparse in the
    ``__main__`` block. Unknown spider types are logged as a warning and
    nothing is executed.
    """
    if FLAGS.type == 'BaiduKeySearch':
        # URL-quote the search key so it survives being passed through the
        # command line into the spider's key_name argument.
        cmd = ['scrapy', 'crawl', FLAGS.type,
               '-a', 'key_name={}'.format(quote(FLAGS.key))]
    elif FLAGS.type == 'baike':
        # Fix: use FLAGS.type instead of a hard-coded 'baike' literal so the
        # two branches stay consistent.
        cmd = ['scrapy', 'crawl', FLAGS.type]
    else:
        logger.warning('Wrong Type!')
        return
    # If a maximum item count was requested, apply Scrapy's built-in
    # CLOSESPIDER_ITEMCOUNT limit (previously duplicated in each branch).
    if FLAGS.max_nb_docs:
        cmd += ['-s', 'CLOSESPIDER_ITEMCOUNT={}'.format(FLAGS.max_nb_docs)]
    # execute() accepts an argv list directly; building a list avoids the
    # fragile format-then-split round trip.
    execute(cmd)
if __name__ == '__main__':
    # Build the command-line interface.
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', '--type', choices=['BaiduKeySearch', 'baike'],
                        dest='type', help='choose one spider in choices list')
    parser.add_argument('-k', '--key', dest='key', type=str,
                        help='add the key you want to search')
    parser.add_argument('-m', '--max-deepth', dest='max_deepth', type=int,
                        help='set the maximum deepth you want to crawl')
    # Keep the legacy single-dash spelling for backward compatibility, but
    # also accept the conventional GNU-style double-dash form.
    parser.add_argument('-max-nb-docs', '--max-nb-docs', dest='max_nb_docs',
                        type=int,
                        help='set the maximum quantity of items you want to crawl')
    parser.add_argument('-o', '--output', dest='output', type=str,
                        default=os.path.abspath('.') + '/crawled_data/',
                        help='set the downloading output path')
    parser.add_argument('-l', '--log', dest='log', type=str,
                        default=os.path.abspath('.') + '/log/',
                        help='set the log file path ')
    parser.add_argument('-f', '--filename', dest='filename', type=str,
                        help='recording the downloaded filename in this file')
    # Parse; unknown args are tolerated (ignored) rather than fatal.
    FLAGS, unparsed = parser.parse_known_args()

    # Propagate the CLI options into the project's Scrapy settings module
    # before the spider is launched.
    settings.KEY_NAME = FLAGS.key
    settings.DEPTH_LIMIT = FLAGS.max_deepth
    settings.OUT_PUT = FLAGS.output
    settings.SHA_ONE = FLAGS.filename
    settings.LOG_FILE = FLAGS.log + 'scrapy {} {} {}.log'.format(
        now.year, now.month, now.day)
    main()