
import argparse
import datetime
import logging
import os
import time

from urllib.parse import quote

from scrapy.cmdline import execute
from crawler import settings


now = datetime.datetime.now()

# Set up logging: write a timestamped log file under ../Logs/
logger = logging.getLogger('crawler.py')
logger.setLevel(logging.INFO)
rq = time.strftime('%Y%m%d%H%M', time.localtime(time.time()))
log_path = os.path.dirname(os.getcwd()) + '/Logs/'
os.makedirs(log_path, exist_ok=True)  # make sure the log directory exists before opening the handler
log_name = log_path + rq + '.log'
fh = logging.FileHandler(log_name, mode='w')
fh.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")
fh.setFormatter(formatter)
logger.addHandler(fh)


def main():
    # Dispatch to the selected spider
    if FLAGS.type == 'BaiduKeySearch':
        # If a maximum item count was requested, pass Scrapy's built-in
        # CLOSESPIDER_ITEMCOUNT setting so the spider stops after that many items
        if FLAGS.max_nb_docs:
            execute("scrapy crawl {} -a key_name={} -s CLOSESPIDER_ITEMCOUNT={}".format(
                FLAGS.type, quote(FLAGS.key), FLAGS.max_nb_docs).split())
        else:
            execute("scrapy crawl {} -a key_name={}".format(FLAGS.type, quote(FLAGS.key)).split())
    elif FLAGS.type == 'baike':
        if FLAGS.max_nb_docs:
            execute('scrapy crawl baike -s CLOSESPIDER_ITEMCOUNT={}'.format(FLAGS.max_nb_docs).split())
        else:
            execute('scrapy crawl {}'.format(FLAGS.type).split())
    else:
        logger.warning('Wrong Type!')

if __name__ == '__main__':
    # Build the command-line parser
    parser = argparse.ArgumentParser()

    # Command-line arguments
    parser.add_argument('-t', '--type', choices=['BaiduKeySearch', 'baike'], dest='type',
                        help='choose one spider from the choices list')
    parser.add_argument('-k', '--key', dest='key', type=str,
                        help='the keyword you want to search for')
    parser.add_argument('-m', '--max-depth', dest='max_depth', type=int,
                        help='set the maximum depth you want to crawl')
    parser.add_argument('--max-nb-docs', dest='max_nb_docs', type=int,
                        help='set the maximum number of items you want to crawl')
    parser.add_argument('-o', '--output', dest='output', type=str,
                        default=os.path.abspath('.') + '/crawled_data/',
                        help='set the download output path')
    parser.add_argument('-l', '--log', dest='log', type=str,
                        default=os.path.abspath('.') + '/log/',
                        help='set the log file path')
    parser.add_argument('-f', '--filename', dest='filename', type=str,
                        help='record the downloaded filenames in this file')
    # Parse
    FLAGS, unparsed = parser.parse_known_args()
    settings.KEY_NAME = FLAGS.key
    settings.DEPTH_LIMIT = FLAGS.max_depth
    settings.OUT_PUT = FLAGS.output
    settings.SHA_ONE = FLAGS.filename
    settings.LOG_FILE = FLAGS.log + 'scrapy {} {} {}.log'.format(now.year, now.month, now.day)
    main()
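
For reference, the script above would typically be run from the Scrapy project root roughly as follows; the script filename (crawler.py), the keyword, and the item limit are illustrative assumptions rather than values from the original post:

    # crawl Baidu search results for a keyword, stopping after 100 items
    python crawler.py -t BaiduKeySearch -k python --max-nb-docs 100

    # run the baike spider with no item limit
    python crawler.py -t baike

The keyword is URL-encoded with quote() before being passed to the spider as the key_name argument, which is why a plain string works on the command line.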

