Using Scrapy
Git repo: https://github.com/PlanWMan/leopard/tree/guaziScrapy
1: Create a new Scrapy project
The generated directory structure
Create the spider file
With that, a basic Scrapy project is set up.
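For reference, this step boils down to two CLI commands. The project name matches this repo; the start domain guazi.com is an assumption:

scrapy startproject guaziSpider    # creates the project skeleton
cd guaziSpider
scrapy genspider guazi guazi.com   # creates spiders/guazi.py (domain assumed)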
2: Basic configuration in settings.py
ROBOTSTXT_OBEY must be set to False, otherwise robots.txt will block most of the crawl.
COOKIES_ENABLED also needs attention: since this project sets the Cookie header manually in middleware, the built-in cookie handling is usually disabled so it does not interfere.
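A minimal sketch of these two settings, assuming cookies are handled entirely by the custom middleware shown later:

ROBOTSTXT_OBEY = False    # do not honor robots.txt
COOKIES_ENABLED = False   # the Cookie header is set manually in middleware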
Both built-in media pipelines, FilesPipeline and ImagesPipeline, can be overridden.
import os

project_dir = os.path.abspath(os.path.dirname(__file__))

# File download settings
FILES_STORE = os.path.join(project_dir, "files")  # download directory
FILES_URLS_FIELD = 'file_urls'   # must match the field name in items.py
FILES_RESULT_FIELD = 'files'     # likewise matches a field in items.py
FILES_EXPIRES = 30               # days before downloaded files expire

# Image download settings
IMAGES_STORE = os.path.join(project_dir, "images")  # storage path
IMAGES_URLS_FIELD = "cimage_urls"
IMAGES_RESULT_FIELD = "cimages"
IMAGES_THUMBS = {
    'small': (50, 50),
    'big': (270, 270),
}  # thumbnail sizes to generate
IMAGES_EXPIRES = 30  # expiration time in days
import json
from scrapy.exceptions import DropItem

class FilePipeline(object):
    def __init__(self):
        # text mode; 'wb' (binary) would raise an error when writing str
        self.file = open('papers.json', 'w')

    def process_item(self, item, spider):
        if item['title']:
            line = json.dumps(dict(item)) + '\n'
            self.file.write(line)
            return item
        else:
            raise DropItem("missing title in %s" % item)

    def close_spider(self, spider):
        self.file.close()  # flush and release the file when the spider finishes
from hashlib import md5
from scrapy.pipelines.images import ImagesPipeline

class MyImagesPipeline(ImagesPipeline):
    # Methods of the images pipeline can be overridden, e.g. the file naming
    def file_path(self, request, response=None, info=None):
        image_guid = md5(request.url.encode()).hexdigest()
        return 'full/%s.jpg' % image_guid
The matching fields must be defined in items.py; a sketch follows.
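A minimal items.py with the fields referenced by the settings above (the class name GuaziItem is assumed; title is consumed by FilePipeline):

import scrapy

class GuaziItem(scrapy.Item):     # class name assumed
    title = scrapy.Field()
    file_urls = scrapy.Field()    # read by FilesPipeline (FILES_URLS_FIELD)
    files = scrapy.Field()        # filled with download results (FILES_RESULT_FIELD)
    cimage_urls = scrapy.Field()  # read by the images pipeline (IMAGES_URLS_FIELD)
    cimages = scrapy.Field()      # filled with image results (IMAGES_RESULT_FIELD)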
Middleware is configured in middlewares.py (mainly used to switch the IP, User-Agent, cookies, and so on).
from time import sleep
from scrapy.http import HtmlResponse

class SeleniumMiddleware(object):
    '''
    Uses selenium to fetch cookies. To limit resource usage it is only
    invoked when requesting the home page; the cookies are persisted to a file.
    '''
    def __init__(self):
        super(SeleniumMiddleware, self).__init__()
        with open("./cookies.txt", "r") as file:
            self.cookie_text = file.readline()

    def process_request(self, request, spider):
        if spider.name == 'guazi':
            request.headers[
                "User-Agent"] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'
            if self.cookie_text:
                request.headers["Cookie"] = self.cookie_text
        return None

    def process_response(self, request, response, spider):
        '''
        # selenium fetched the page and built the Response itself, so
        # process_request does not need to return another Request to the Downloader
        # request: the request object that produced this response
        # response: the intercepted response object
        # spider: the spider instance; its attributes and methods are accessible here
        '''
        if response.status != 200:
            # fetch fresh cookies with the browser
            spider.browser.get(url=request.url)
            sleep(2)  # wait 2s for the page to finish loading
            cook = "; ".join(["%s=%s" % (cookie['name'], cookie['value']) for cookie in spider.browser.get_cookies()])
            row_response = spider.browser.page_source
            with open("./cookies.txt", 'w') as file_object:
                file_object.write(cook)
            return HtmlResponse(url=spider.browser.current_url, body=row_response, encoding="utf8", request=request)
        else:
            return response

    # Called when a request fails, e.g. a banned IP; a proxy can be set here
    def process_exception(self, request, exception, spider):
        pass
        # print("adding proxy")
        # ret_proxy = get_proxy()
        # request.meta["proxy"] = ret_proxy
        # print("added proxy %s for %s" % (ret_proxy, request.url), end="")
        # return None
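For the middleware to run it must be registered in settings.py, and spider.browser assumes the spider creates a selenium webdriver. A sketch under those assumptions (the priority 543 and the Chrome driver are arbitrary choices, and the spider base class may differ in a scrapy-redis project):

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'guaziSpider.middlewares.SeleniumMiddleware': 543,  # priority assumed
}

# spiders/guazi.py -- provides the `browser` attribute the middleware relies on
import scrapy
from selenium import webdriver

class GuaziSpider(scrapy.Spider):
    name = 'guazi'

    def __init__(self, *args, **kwargs):
        super(GuaziSpider, self).__init__(*args, **kwargs)
        self.browser = webdriver.Chrome()  # driver choice assumed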
Configuring scrapy-redis
# redis configuration
# use the scrapy_redis scheduler
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
# keep the scrapy-redis queues in redis, so a crawl can be paused and resumed
SCHEDULER_PERSIST = True
# re-crawl from scratch on every start
# SCHEDULER_FLUSH_ON_START = True
# use redis-based request deduplication
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
LOG_LEVEL = 'DEBUG'
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
# 'guaziSpider.pipelines.FilePipeline': 1,
# 'guaziSpider.pipelines.MyImagesPipeline': 1,
    'scrapy_redis.pipelines.RedisPipeline': 100,
'guaziSpider.pipelines.GuaZiMongoPipline': 300,
}
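The Redis connection itself is configured separately; a minimal sketch assuming a local instance on the default port:

REDIS_HOST = 'localhost'  # assumed local redis
REDIS_PORT = 6379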
The keys stored in Redis represent, respectively, the request queue, the dupefilter fingerprints, and the scraped items (by default <spider>:requests, <spider>:dupefilter, and <spider>:items).
Also, when writing the spider, make sure the item object is created inside the for loop; otherwise every yield reuses the same object and the results overwrite one another.
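A minimal sketch of that pattern (the CSS selectors and the GuaziItem name are assumptions):

def parse(self, response):
    for car in response.css('ul.carlist li'):  # selector assumed
        item = GuaziItem()                     # a NEW item inside the loop
        item['title'] = car.css('h2::text').get()
        item['cimage_urls'] = car.css('img::attr(src)').getall()
        yield item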