- 目录结构
- main.py文件
# -*- coding:utf-8 -*-
from scrapy import cmdline

# Entry point: start the "test" spider exactly as `scrapy crawl test`
# would from the shell (argv passed as an explicit list).
cmdline.execute(['scrapy', 'crawl', 'test'])
settings.py文件
# -*- coding: utf-8 -*-
BOT_NAME = 'mytest'

SPIDER_MODULES = ['mytest.spiders']
NEWSPIDER_MODULE = 'mytest.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'mytest (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Random download delay upper bound (seconds), read by RandomDelayMiddleware
# — presumably as the cap for a per-request random sleep; confirm against
# the middleware implementation.
RANDOM_DELAY = 2

DOWNLOADER_MIDDLEWARES = {
    # 'mytest.middlewares.MytestDownloaderMiddleware': 543,
    # Fix: both enabled middlewares previously used priority 100, which
    # leaves their relative order in the middleware chain unspecified.
    # Distinct values make the ordering deterministic (lower runs closer
    # to the engine for process_request).
    'mytest.middlewares.RandomDelayMiddleware': 100,
    'mytest.middlewares.UserAgentMiddleware': 110,
    # 'mytest.middlewares.ProxyMiddleware': 120,
}
# 代理IP池