# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
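#
# For reference, enabling this pipeline looks roughly like the snippet below
# (the 'crawler.pipelines' module path is an assumption; substitute your
# project's actual path):
#
#   ITEM_PIPELINES = {
#       'crawler.pipelines.CrawlerPipeline': 300,
#   }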
import os
import oss2
import logging
import hashlib
import datetime
import zipfile
from scrapy.utils.project import get_project_settings

# scrapy.conf was removed from Scrapy; get_project_settings() is the
# supported way to read the project settings from library code.
settings = get_project_settings()
# Convert a URL to an MD5 value for compact storage; returns the middle 16
# characters of the URL's 32-character MD5 hex digest.
def creat_md5(url):
    m = hashlib.md5()
    m.update(url.encode('utf-8'))
    # hexdigest() is 32 characters by default; slicing [8:-8] keeps the middle
    # 16 to further reduce storage.
    return m.hexdigest()[8:-8]
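# A minimal illustration of the tradeoff (the URL is hypothetical): the full
# digest is 32 hex characters, and [8:-8] keeps the middle 16, halving storage
# per URL at the cost of a higher (still small, for modest URL counts)
# collision risk.
#   full = hashlib.md5('https://baike.baidu.com/item/Python'.encode('utf-8')).hexdigest()
#   assert creat_md5('https://baike.baidu.com/item/Python') == full[8:-8]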
# Load every URL already crawled (one per line, in the files under file_path)
# into a set of 16-character MD5 values, used for de-duplication.
def creat_set(file_path):
    url_st = set()
    for file in os.listdir(file_path):
        with open(os.path.join(file_path, file), 'r') as f:
            for url in f:
                url_st.add(creat_md5(url.rstrip('\n')))
    return url_st
# Name an output file: the search keyword plus the spider's start time, or
# the start time alone when no keyword is configured.
def time_now():
    key_value = settings.get('KEY_NAME')
    if key_value is None:
        return datetime.datetime.now().strftime('%Y-%m-%d-%H-%M')
    else:
        return key_value + datetime.datetime.now().strftime('%Y-%m-%d-%H-%M')
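# For example, with KEY_NAME = 'python' (an assumed value) and a start time of
# 2018-05-21 14:30, time_now() returns 'python2018-05-21-14-30'; with KEY_NAME
# unset it returns just '2018-05-21-14-30'.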
# OSS connection details, read from settings.
OSS_ACCESS_KEY_ID = settings.get('OSS_ACCESS_KEY_ID')
OSS_ACCESS_KEY_SECRET = settings.get('OSS_ACCESS_KEY_SECRET')
bucket_name = settings.get('BUCKET_NAME')
endpoint = settings.get('ENDPOINT')
# Upload a local file to OSS under the given object key.
def upload_file(local_file, ossdir):
    auth = oss2.Auth(OSS_ACCESS_KEY_ID, OSS_ACCESS_KEY_SECRET)
    bucket = oss2.Bucket(auth, endpoint, bucket_name)
    result = bucket.put_object_from_file(ossdir, local_file)
    if result.status == 200:
        logging.info("{} upload succeeded".format(local_file))
    else:
        logging.warning("{} upload failed, status: {}".format(local_file, result.status))
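# Note: put_object_from_file sends the whole file in a single request. For the
# multi-hundred-MB archives produced below, oss2's resumable (multipart) upload
# would be a more robust option; a sketch, reusing the same auth and bucket:
#   oss2.resumable_upload(bucket, ossdir, local_file)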
class CrawlerPipeline(object):
    def __init__(self):
        # MD5 values of the Baidu Baike URLs that have already been crawled.
        self.url_st1 = creat_set(os.path.abspath('.') + '/crawled_urls/bdbk')
        # MD5 values of the Baidu meta-search URLs that have already been crawled.
        self.url_st2 = creat_set(os.path.abspath('.') + '/crawled_urls/bdks')
        self.time_stamp2 = 1
        self.time_now = time_now()
    # Called when the spider starts.
    def open_spider(self, spider):
        logging.info("{} spider started".format(self.time_now))
    # Process an item and export its data.
    def process_item(self, item, spider):
        # Several spiders run in the same project, so dispatch on the spider name.
        if spider.name == 'baike':
            if creat_md5(str(item['url'])) not in self.url_st1:
                # Append the newly crawled Baidu Baike URL to the current URL log file.
                with open('{}bdbk/crawled_url{}'.format(settings.get('URL_STORED'), str(self.time_stamp2)), 'a') as f:
                    f.write(item['url'] + '\n')
                # Check the URL log file's size; roll over to a new file once it exceeds the limit.
                Size1 = os.path.getsize('{}bdbk/crawled_url{}'.format(settings.get('URL_STORED'), str(self.time_stamp2))) / (1024 ** 2)
                if Size1 > 500:
                    self.time_stamp2 += 1
                # Remember the new URL in the in-memory de-duplication set.
                self.url_st1.add(creat_md5(str(item['url'])))
                # Export the item: five lines per record, ending with a sentinel.
                with open('{}BDBK/{}.dat'.format(settings.get('OUT_PUT'), self.time_now), 'a') as f:
                    f.write(item['url'] + '\n' + item['origin_length'] + '\n' + item['compressed_length'] + '\n' + item['compressed_html'] + '\n' + "TML_DOC_END" + '\n')
                # Once the .dat file exceeds 10 MB, zip it and upload the archive to OSS.
                Size2 = os.path.getsize('{}BDBK/{}.dat'.format(settings.get('OUT_PUT'), self.time_now)) / (1024 ** 2)
                if Size2 > 10:
                    # Close the archive before uploading so the zip directory is flushed to disk.
                    with zipfile.ZipFile('{}BDBK/{}.zip'.format(settings.get('OUT_PUT'), self.time_now), 'w', zipfile.ZIP_DEFLATED) as zf:
                        zf.write('{}BDBK/{}.dat'.format(settings.get('OUT_PUT'), self.time_now))
                    upload_file('{}BDBK/{}.zip'.format(settings.get('OUT_PUT'), self.time_now), '{}.zip'.format(self.time_now))
                    self.time_now = time_now()
        # The meta-search (bdks) branch. Its opening lines were lost in the
        # source; the lines from here to the export below are an assumption,
        # reconstructed to mirror the baike branch above.
        else:
            if creat_md5(str(item['url'])) not in self.url_st2:
                # Append the newly crawled meta-search URL to the URL log file.
                with open('{}bdks/crawled_url{}'.format(settings.get('URL_STORED'), str(self.time_stamp2)), 'a') as f:
                    f.write(item['url'] + '\n')
                # Remember the new URL in the in-memory de-duplication set.
                self.url_st2.add(creat_md5(str(item['url'])))
                # Export the item in the same five-line record format.
                with open('{}BDKS/{}.dat'.format(settings.get('OUT_PUT'), self.time_now), 'a') as f:
                    f.write(item['url'] + '\n' + item['origin_length'] + '\n' + item['compressed_length'] + '\n' + item['compressed_html'] + '\n' + "TML_DOC_END" + '\n')
                # Once the .dat file exceeds 500 MB, zip it, upload it to OSS,
                # and start a fresh output file.
                Size2 = os.path.getsize('{}BDKS/{}.dat'.format(settings.get('OUT_PUT'), self.time_now)) / (1024 ** 2)
                if Size2 > 500:
                    # Close the archive before uploading so the zip directory is flushed to disk.
                    with zipfile.ZipFile('{}BDKS/{}.zip'.format(settings.get('OUT_PUT'), self.time_now), 'w', zipfile.ZIP_DEFLATED) as zf:
                        zf.write('{}BDKS/{}.dat'.format(settings.get('OUT_PUT'), self.time_now))
                    upload_file('{}BDKS/{}.zip'.format(settings.get('OUT_PUT'), self.time_now), '{}.zip'.format(self.time_now))
                    self.time_now = time_now()
        return item
    # Called when the spider closes.
    def close_spider(self, spider):
        if spider.name == 'baike':
            logging.info("{} spider finished".format(self.time_now))
        else:
            # Zip and upload whatever remains in the current .dat file.
            with zipfile.ZipFile('{}BDKS/{}.zip'.format(settings.get('OUT_PUT'), self.time_now), 'w', zipfile.ZIP_DEFLATED) as zf:
                zf.write('{}BDKS/{}.dat'.format(settings.get('OUT_PUT'), self.time_now))
            upload_file('{}BDKS/{}.zip'.format(settings.get('OUT_PUT'), self.time_now), '{}.zip'.format(self.time_now))
            # Record the uploaded archive's name in the marker file.
            with open('{}bdks_mark/{}'.format(settings.get('OUT_PUT'), settings.get('SHA_ONE')), 'a') as f:
                f.write(self.time_now + '.zip' + '\n')
            logging.info("{} spider finished".format(self.time_now))