# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
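#
# For reference, enabling this pipeline looks roughly like the snippet below
# (the 'crawler.pipelines' module path is an assumption; substitute your
# project's actual path):
#
#   ITEM_PIPELINES = {
#       'crawler.pipelines.CrawlerPipeline': 300,
#   }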
import os
import oss2
import logging
import hashlib
import datetime
import zipfile
from scrapy.utils.project import get_project_settings

# scrapy.conf was removed from Scrapy; get_project_settings() is the
# supported way to read the project settings from library code.
settings = get_project_settings()
# Convert a URL to an MD5 value for compact storage; returns the middle 16
# characters of the URL's 32-character MD5 hex digest.
def creat_md5(url):
    m = hashlib.md5()
    m.update(url.encode('utf-8'))
    # hexdigest() is 32 characters by default; slicing [8:-8] keeps the middle
    # 16 to further reduce storage.
    return m.hexdigest()[8:-8]
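# A minimal illustration of the tradeoff (the URL is hypothetical): the full
# digest is 32 hex characters, and [8:-8] keeps the middle 16, halving storage
# per URL at the cost of a higher (still small, for modest URL counts)
# collision risk.
#   full = hashlib.md5('https://baike.baidu.com/item/Python'.encode('utf-8')).hexdigest()
#   assert creat_md5('https://baike.baidu.com/item/Python') == full[8:-8]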
# Load every URL already crawled (one per line, in the files under file_path)
# into a set of 16-character MD5 values, used for de-duplication.
def creat_set(file_path):
    url_st = set()
    for file in os.listdir(file_path):
        with open(os.path.join(file_path, file), 'r') as f:
            for url in f:
                url_st.add(creat_md5(url.rstrip('\n')))
    return url_st
# Name an output file: the search keyword plus the spider's start time, or
# the start time alone when no keyword is configured.
def time_now():
    key_value = settings.get('KEY_NAME')
    if key_value is None:
        return datetime.datetime.now().strftime('%Y-%m-%d-%H-%M')
    else:
        return key_value + datetime.datetime.now().strftime('%Y-%m-%d-%H-%M')
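# For example, with KEY_NAME = 'python' (an assumed value) and a start time of
# 2018-05-21 14:30, time_now() returns 'python2018-05-21-14-30'; with KEY_NAME
# unset it returns just '2018-05-21-14-30'.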
# OSS connection details, read from settings.
OSS_ACCESS_KEY_ID = settings.get('OSS_ACCESS_KEY_ID')
OSS_ACCESS_KEY_SECRET = settings.get('OSS_ACCESS_KEY_SECRET')
bucket_name = settings.get('BUCKET_NAME')
endpoint = settings.get('ENDPOINT')
# Upload a local file to OSS under the given object key.
def upload_file(local_file, ossdir):
    auth = oss2.Auth(OSS_ACCESS_KEY_ID, OSS_ACCESS_KEY_SECRET)
    bucket = oss2.Bucket(auth, endpoint, bucket_name)
    result = bucket.put_object_from_file(ossdir, local_file)
    if result.status == 200:
        logging.info("{} upload succeeded".format(local_file))
    else:
        logging.warning("{} upload failed, status: {}".format(local_file, result.status))
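# Note: put_object_from_file sends the whole file in a single request. For the
# multi-hundred-MB archives produced below, oss2's resumable (multipart) upload
# would be a more robust option; a sketch, reusing the same auth and bucket:
#   oss2.resumable_upload(bucket, ossdir, local_file)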
class CrawlerPipeline(object):
    def __init__(self):
        # MD5 values of the Baidu Baike URLs that have already been crawled.
        self.url_st1 = creat_set(os.path.abspath('.') + '/crawled_urls/bdbk')
        # MD5 values of the Baidu meta-search URLs that have already been crawled.
        self.url_st2 = creat_set(os.path.abspath('.') + '/crawled_urls/bdks')
        self.time_stamp2 = 1
        self.time_now = time_now()
    # Called when the spider starts.
    def open_spider(self, spider):
        logging.info("{} spider started".format(self.time_now))
    # Process an item and export its data.
    def process_item(self, item, spider):
        # Several spiders run in the same project, so dispatch on the spider name.
        if spider.name == 'baike':
            if creat_md5(str(item['url'])) not in self.url_st1:
                # Append the newly crawled Baidu Baike URL to the current URL log file.
                with open('{}bdbk/crawled_url{}'.format(settings.get('URL_STORED'), str(self.time_stamp2)), 'a') as f:
                    f.write(item['url'] + '\n')
                # Check the URL log file's size; roll over to a new file once it exceeds the limit.
                Size1 = os.path.getsize('{}bdbk/crawled_url{}'.format(settings.get('URL_STORED'), str(self.time_stamp2))) / (1024 ** 2)
                if Size1 > 500:
                    self.time_stamp2 += 1
                # Remember the new URL in the in-memory de-duplication set.
                self.url_st1.add(creat_md5(str(item['url'])))
                # Export the item: five lines per record, ending with a sentinel.
                with open('{}BDBK/{}.dat'.format(settings.get('OUT_PUT'), self.time_now), 'a') as f:
                    f.write(item['url'] + '\n' + item['origin_length'] + '\n' + item['compressed_length'] + '\n' + item['compressed_html'] + '\n' + "TML_DOC_END" + '\n')
                # Once the .dat file exceeds 10 MB, zip it and upload the archive to OSS.
                Size2 = os.path.getsize('{}BDBK/{}.dat'.format(settings.get('OUT_PUT'), self.time_now)) / (1024 ** 2)
                if Size2 > 10:
                    # Close the archive before uploading so the zip directory is flushed to disk.
                    with zipfile.ZipFile('{}BDBK/{}.zip'.format(settings.get('OUT_PUT'), self.time_now), 'w', zipfile.ZIP_DEFLATED) as zf:
                        zf.write('{}BDBK/{}.dat'.format(settings.get('OUT_PUT'), self.time_now))
                    upload_file('{}BDBK/{}.zip'.format(settings.get('OUT_PUT'), self.time_now), '{}.zip'.format(self.time_now))
                    self.time_now = time_now()
        # The meta-search (bdks) branch. Its opening lines were lost in the
        # source; the lines from here to the export below are an assumption,
        # reconstructed to mirror the baike branch above.
        else:
            if creat_md5(str(item['url'])) not in self.url_st2:
                # Append the newly crawled meta-search URL to the URL log file.
                with open('{}bdks/crawled_url{}'.format(settings.get('URL_STORED'), str(self.time_stamp2)), 'a') as f:
                    f.write(item['url'] + '\n')
                # Remember the new URL in the in-memory de-duplication set.
                self.url_st2.add(creat_md5(str(item['url'])))
                # Export the item in the same five-line record format.
                with open('{}BDKS/{}.dat'.format(settings.get('OUT_PUT'), self.time_now), 'a') as f:
                    f.write(item['url'] + '\n' + item['origin_length'] + '\n' + item['compressed_length'] + '\n' + item['compressed_html'] + '\n' + "TML_DOC_END" + '\n')
                # Once the .dat file exceeds 500 MB, zip it, upload it to OSS,
                # and start a fresh output file.
                Size2 = os.path.getsize('{}BDKS/{}.dat'.format(settings.get('OUT_PUT'), self.time_now)) / (1024 ** 2)
                if Size2 > 500:
                    # Close the archive before uploading so the zip directory is flushed to disk.
                    with zipfile.ZipFile('{}BDKS/{}.zip'.format(settings.get('OUT_PUT'), self.time_now), 'w', zipfile.ZIP_DEFLATED) as zf:
                        zf.write('{}BDKS/{}.dat'.format(settings.get('OUT_PUT'), self.time_now))
                    upload_file('{}BDKS/{}.zip'.format(settings.get('OUT_PUT'), self.time_now), '{}.zip'.format(self.time_now))
                    self.time_now = time_now()
        return item
    # Called when the spider closes.
    def close_spider(self, spider):
        if spider.name == 'baike':
            logging.info("{} spider finished".format(self.time_now))
        else:
            # Zip and upload whatever remains in the current .dat file.
            with zipfile.ZipFile('{}BDKS/{}.zip'.format(settings.get('OUT_PUT'), self.time_now), 'w', zipfile.ZIP_DEFLATED) as zf:
                zf.write('{}BDKS/{}.dat'.format(settings.get('OUT_PUT'), self.time_now))
            upload_file('{}BDKS/{}.zip'.format(settings.get('OUT_PUT'), self.time_now), '{}.zip'.format(self.time_now))
            # Record the uploaded archive's name in the marker file.
            with open('{}bdks_mark/{}'.format(settings.get('OUT_PUT'), settings.get('SHA_ONE')), 'a') as f:
                f.write(self.time_now + '.zip' + '\n')
            logging.info("{} spider finished".format(self.time_now))