When an article has not been saved yet, save it to D:/知乎日报/latest; when the file already exists and is still in latest, move it to past.
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pdfkit
import os
import shutil
class ZhihudailyPipeline(object):
    def process_item(self, item, spider):
        try:
            filename = os.path.basename(item["filename"])
            dirname = os.path.dirname(item["filename"])
            if not self.file_exists(filename, dirname):
                print('*' * 20)
                print(item["filename"], "downloading")
                print('*' * 20)
                print('\n')
                # New article: render the page to a PDF under latest/
                pdfkit.from_url(item["url"], os.path.join(dirname, 'latest', filename))
            else:
                print('*' * 20)
                print("file already exists")
                print('*' * 20)
                print('\n')
                # Seen before: if the copy is still in latest/, retire it to past/
                for _root, _dirs, files in os.walk(os.path.join(dirname, 'latest')):
                    if filename in files:
                        shutil.move(os.path.join(dirname, 'latest', filename),
                                    os.path.join(dirname, 'past'))
                        print("file moved to past\n")
        except IOError:
            # pdfkit raises "Exit with code 1 due to network error:
            # ContentNotFoundError" here when the page's CSS references
            # external resources (fonts, images, iframe content, ...).
            # The PDF is still produced, so this error is deliberately ignored.
            pass
        return item

    def file_exists(self, filename, dirname):
        # Look for the file anywhere under dirname (covers both latest/ and past/)
        for _root, _dirs, files in os.walk(dirname):
            if filename in files:
                print(filename, "already exists\n")
                return True
        print(filename, "does not exist\n")
        return False
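
As the header comment notes, the pipeline only runs once it is registered in the project's settings.py. A minimal sketch, assuming the Scrapy project/module is named zhihudaily (adjust the dotted path to your own layout):

# settings.py -- enable the pipeline; the module path "zhihudaily" is an
# assumption about the project name
ITEM_PIPELINES = {
    'zhihudaily.pipelines.ZhihudailyPipeline': 300,  # 300 is the run-order priority
}

Note also that pdfkit is only a wrapper around the wkhtmltopdf binary, which must be installed and reachable on PATH for from_url to work.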
There is still a BUG, though: if an article scraped in this run is removed from the Zhihu Daily front page shortly afterwards, the spider never sees it again, so its file stays in latest forever.
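
One way to close that gap is Scrapy's close_spider pipeline hook: when the run finishes, anything still sitting in latest/ that this run did not scrape must have been taken off the front page, so it can be retired to past/. A minimal sketch, assuming the spider collects the filenames it saw this run in a (hypothetical) seen_files set:

import os
import shutil

LATEST = "D:/知乎日报/latest"
PAST = "D:/知乎日报/past"

class ZhihudailyPipeline(object):
    # ... process_item and file_exists as above ...

    def close_spider(self, spider):
        # spider.seen_files is a hypothetical set the spider would fill with
        # the basename of every article found on the front page this run.
        seen = getattr(spider, "seen_files", set())
        for name in os.listdir(LATEST):
            if name not in seen:
                # No longer on the front page: retire it from latest/ to past/
                shutil.move(os.path.join(LATEST, name), os.path.join(PAST, name))
                print(name, "moved to past\n")

The spider would add the basename of item["filename"] to seen_files each time it yields an item, so the hook only moves the leftovers.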