# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
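#
# A minimal sketch of that setting (the module path 'crawler.pipelines' is an
# assumption; substitute your own project's module path):
#
# ITEM_PIPELINES = {
#     'crawler.pipelines.CrawlerPipeline': 300,
# }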

import os
import oss2
import logging
import hashlib
import datetime
import zipfile
from scrapy.conf import settings


# Convert a URL into an MD5 digest used for compact storage; returns the middle
# 16 hex characters of the URL's MD5.
def creat_md5(url):
    m = hashlib.md5()
    m.update(url.encode('utf-8'))
    # The full hex digest is 32 characters; slicing [8:-8] keeps the middle 16
    # to further reduce storage space.
    return m.hexdigest()[8:-8]
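# Example (placeholder URL): creat_md5('https://example.com') always yields a
# 16-character hex string, i.e. len(creat_md5('https://example.com')) == 16.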

# Load every URL already crawled (stored in files under file_path) into a set of
# MD5 digests, used for de-duplication. Returns the set of URL digests.
def creat_set(file_path):
    url_st = set()
    for file in os.listdir(file_path):
        with open(os.path.join(file_path, file), 'r') as f:
            urls = f.readlines()
            for url in urls:
                url_st.add(creat_md5(url.rstrip('\n')))
    return url_st
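# Example (paths follow the layout used in CrawlerPipeline.__init__ below):
# creat_set('./crawled_urls/bdbk') loads the digest of every URL recorded under
# that directory, so previously crawled Baidu Baike pages can be skipped.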

# Build the file-name stamp: the search keyword plus the spider start time, or
# just the start time when no keyword is configured.
def time_now():
    key_value = settings.get('KEY_NAME')
    if key_value is None:
        return datetime.datetime.now().strftime('%Y-%m-%d-%H-%M')
    else:
        return key_value + datetime.datetime.now().strftime('%Y-%m-%d-%H-%M')
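# Example: with KEY_NAME = 'python' in settings the stamp looks like
# 'python2024-05-01-12-30' (placeholder date); without KEY_NAME it is just the timestamp.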

# OSS credentials and endpoint, read from the project settings
OSS_ACCESS_KEY_ID = settings.get('OSS_ACCESS_KEY_ID')
OSS_ACCESS_KEY_SECRET = settings.get('OSS_ACCESS_KEY_SECRET')
bucket_name = settings.get('BUCKET_NAME')
endpoint = settings.get('ENDPOINT')
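# A sketch of the corresponding settings.py entries (all values below are
# placeholders, not real credentials or endpoints):
#
# OSS_ACCESS_KEY_ID = 'your-access-key-id'
# OSS_ACCESS_KEY_SECRET = 'your-access-key-secret'
# BUCKET_NAME = 'your-bucket-name'
# ENDPOINT = 'https://oss-cn-hangzhou.aliyuncs.com'
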
# Upload a local file to OSS.
def upload_file(local_file, ossdir):
    auth = oss2.Auth(OSS_ACCESS_KEY_ID, OSS_ACCESS_KEY_SECRET)
    bucket = oss2.Bucket(auth, endpoint, bucket_name)
    result = bucket.put_object_from_file(ossdir, local_file)
    if result.status == 200:
        logging.info("{} uploaded successfully".format(local_file))
    else:
        logging.warning("{} upload failed, status: {}".format(local_file, result.status))
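# Example call (both paths are illustrative placeholders):
# upload_file('./output/BDBK/2024-05-01-12-30.zip', '2024-05-01-12-30.zip')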


class CrawlerPipeline(object):

    def __init__(self):
        # Digests of Baidu Baike URLs that have already been crawled
        self.url_st1 = creat_set(os.path.abspath('.') + '/crawled_urls/bdbk')
        # Digests of Baidu meta-search (bdks) URLs that have already been crawled
        self.url_st2 = creat_set(os.path.abspath('.') + '/crawled_urls/bdks')
        self.time_stamp2=1
        self.time_now=time_now()

    # Called when the spider starts
    def open_spider(self, spider):
        logging.info("{} spider started".format(self.time_now))

    # Process each item and export its data
    def process_item(self, item, spider):
        # Several spiders share this pipeline, so dispatch on the spider name
        if spider.name == 'baike':
            if creat_md5(str(item['url'])) not in self.url_st1:
                # Append the newly crawled Baidu Baike URL to the record file
                with open('{}bdbk/crawled_url{}'.format(settings.get('URL_STORED'), str(self.time_stamp2)), 'a') as f:
                    f.write(item['url'] + '\n')
                # If the URL record file exceeds the size limit, roll over to a new file
                Size1 = os.path.getsize('{}bdbk/crawled_url{}'.format(settings.get('URL_STORED'), str(self.time_stamp2))) / (1024 ** 2)
                if Size1 > 500:
                    self.time_stamp2 += 1
                # Remember the new URL in the crawled set
                self.url_st1.add(creat_md5(str(item['url'])))

            # Export the item
            with open('{}BDBK/{}.dat'.format(settings.get('OUT_PUT'), self.time_now), 'a') as f:
                f.write(item['url'] + '\n' + item['origin_length'] + '\n' + item['compressed_length'] + '\n' + item['compressed_html'] + '\n' + "TML_DOC_END" + '\n')
            Size2 = os.path.getsize('{}BDBK/{}.dat'.format(settings.get('OUT_PUT'), self.time_now)) / (1024 ** 2)
            if Size2 > 10:
                zipfile.ZipFile('{}BDBK/{}.zip'.format(settings.get('OUT_PUT'), self.time_now), 'w', zipfile.ZIP_DEFLATED).write('{}BDBK/{}.dat'.format(settings.get('OUT_PUT'), self.time_now))
                upload_file('{}BDBK/{}.zip'.format(settings.get('OUT_PUT'), self.time_now), '{}.zip'.format(self.time_now))
                self.time_now = time_now()

        # Baidu meta-search (bdks) spider
        else:
            # Only export pages whose URL has not been seen before
            if creat_md5(str(item['url'])) not in self.url_st2:
                self.url_st2.add(creat_md5(str(item['url'])))
                # Export the item
                with open('{}BDKS/{}.dat'.format(settings.get('OUT_PUT'), self.time_now), 'a') as f:
                    f.write(item['url'] + '\n' + item['origin_length'] + '\n' + item['compressed_length'] + '\n' + item['compressed_html'] + '\n' + "TML_DOC_END" + '\n')
                # If the .dat file exceeds the size limit, compress it, upload it, and roll over
                Size2 = os.path.getsize('{}BDKS/{}.dat'.format(settings.get('OUT_PUT'), self.time_now)) / (1024 ** 2)
                if Size2 > 500:
                    zipfile.ZipFile('{}BDKS/{}.zip'.format(settings.get('OUT_PUT'), self.time_now), 'w', zipfile.ZIP_DEFLATED).write('{}BDKS/{}.dat'.format(settings.get('OUT_PUT'), self.time_now))
                    upload_file('{}BDKS/{}.zip'.format(settings.get('OUT_PUT'), self.time_now), '{}.zip'.format(self.time_now))
                    self.time_now = time_now()
        return item

    # Called when the spider closes
    def close_spider(self, spider):
        if spider.name == 'baike':
            logging.info("{} spider finished".format(self.time_now))
        else:
            # Compress and upload whatever remains in the current BDKS .dat file
            zipfile.ZipFile('{}BDKS/{}.zip'.format(settings.get('OUT_PUT'), self.time_now), 'w', zipfile.ZIP_DEFLATED).write('{}BDKS/{}.dat'.format(settings.get('OUT_PUT'), self.time_now))
            upload_file('{}BDKS/{}.zip'.format(settings.get('OUT_PUT'), self.time_now), '{}.zip'.format(self.time_now))
            # Record the uploaded archive name in the mark file
            with open('{}bdks_mark/{}'.format(settings.get('OUT_PUT'), settings.get('SHA_ONE')), 'a') as f:
                f.write(self.time_now + '.zip' + '\n')
            logging.info("{} spider finished".format(self.time_now))