费劲吧啦地爬到了数据,在满心欢喜之前还有一关要过,那就是去重
是滴,一万条数据重复一千次就只有十条有效数据了
不要问我这个咋算的,我不会告诉你,总之这个时候就需要另一个英雄登场了
布隆!!!!
好吧,不是他,是另一个"布隆"——用布隆过滤器(Bloom Filter)来去重
https://github.com/liyaopinner/BloomFilter_imooc
下载布隆过滤器,将其中的py_bloomfilter.py放到scrapy_redis包中
是的,这里需要用到redis
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class JobsspiderPipeline(object):
    """Default no-op pipeline generated by Scrapy; items pass through untouched."""

    def process_item(self, item, spider):
        """Return *item* unchanged so any later pipeline can keep processing it."""
        return item
# Synchronous DB writes are slower than the crawler produces items, which can
# leave the last items unwritten when the crawl finishes — hence the
# asynchronous pipeline below.
#
# Recipe for the asynchronous MySQL pipeline:
#   1. import twisted.enterprise.adbapi and pymysql.cursors
#   2. put the database connection parameters in settings
#   3. implement from_settings(): read the parameters from settings, build a
#      connection pool from them, and return a pipeline instance that holds
#      the pool as an attribute
#   4. implement process_item(): hand the worker function and its arguments to
#      db_pool.runInteraction(func, args); the worker runs the SQL via cursor
#   5. take the result returned by runInteraction(), attach an error callback,
#      and print the failure reason inside it
import pymysql
from pymysql import cursors

# Twisted handles the asynchronous task processing; adbapi is its
# database-access module (thread-pool wrapper around DB-API drivers).
from twisted.enterprise import adbapi
class MySQLTwistedPipeline(object):
    """Write scraped job items to MySQL asynchronously via Twisted's adbapi pool."""

    # Scrapy calls from_settings automatically once the pipeline is activated,
    # passing the project settings so we can build the connection pool.
    @classmethod
    def from_settings(cls, settings):
        """Read MySQL parameters from *settings*, build a pool, return the pipeline."""
        params = {
            'host': settings['MYSQL_HOST'],
            'user': settings['MYSQL_USER'],
            'password': settings['MYSQL_PASSWD'],
            'port': settings['MYSQL_PORT'],
            'db': settings['MYSQL_DBNAME'],
            'charset': settings['MYSQL_CHARSET'],
            'use_unicode': True,
            # DictCursor: rows come back as dicts instead of tuples.
            'cursorclass': cursors.DictCursor,
        }
        # First argument names the DB-API module the pool should import;
        # the rest are forwarded to pymysql.connect().
        pool = adbapi.ConnectionPool('pymysql', **params)
        return cls(pool)

    def __init__(self, db_pool):
        # Connection pool created by from_settings().
        self.db_pool = db_pool

    def process_item(self, item, spider):
        """Queue an asynchronous INSERT for *item* and pass the item along."""
        # runInteraction(func, *args) runs func(cursor, *args) in a pool thread.
        deferred = self.db_pool.runInteraction(self.insert_item, item)
        # If the SQL raises, the errback below is invoked with the failure.
        deferred.addErrback(self.handle_error, item, spider)
        return item

    def handle_error(self, failure, item, spider):
        """Errback: report why the insert failed (*failure* holds the reason)."""
        print(failure)

    def insert_item(self, cursor, item):
        """Runs inside the pool's thread: execute the parameterized INSERT."""
        sql = "INSERT INTO jobs(job_name,org_name,job_location,max_money,min_money,date)VALUES (%s,%s,%s,%s,%s,%s)"
        values = (
            item['job_name'],
            item['org_name'],
            item['job_location'],
            item['max_money'],
            item['min_money'],
            item['date'],
        )
        cursor.execute(sql, values)
import mmh3
from redis import ConnectionPool,StrictRedis
import math
import time
class PyBloomFilter():
    """Redis-backed Bloom filter using mmh3 hashes over redis bitmaps.

    Bits are stored in redis string values named '<key>_<i>'; a value is
    routed to a block by the ordinal of its first character, so values are
    expected to start with an ASCII character (at most 256 blocks).
    """

    # 100 fixed seeds for the mmh3 hash family; only the first k are used.
    SEEDS = [543, 460, 171, 876, 796, 607, 650, 81, 837, 545, 591, 946, 846, 521, 913, 636, 878, 735, 414, 372,
             344, 324, 223, 180, 327, 891, 798, 933, 493, 293, 836, 10, 6, 544, 924, 849, 438, 41, 862, 648, 338,
             465, 562, 693, 979, 52, 763, 103, 387, 374, 349, 94, 384, 680, 574, 480, 307, 580, 71, 535, 300, 53,
             481, 519, 644, 219, 686, 236, 424, 326, 244, 212, 909, 202, 951, 56, 812, 901, 926, 250, 507, 739, 371,
             63, 584, 154, 7, 284, 617, 332, 472, 140, 605, 262, 355, 526, 647, 923, 199, 518]

    def __init__(self, capacity=1000000000, error_rate=0.00000001, conn=None, key='BloomFilter'):
        """Size the filter for *capacity* values at *error_rate* false positives.

        capacity   -- expected number of distinct values to deduplicate
        error_rate -- acceptable false-positive probability
        conn       -- redis client used for setbit/getbit (may be None for sizing only)
        key        -- prefix of the redis keys holding the bitmaps
        """
        # m = n * log2(e) * log2(1/p): total number of bits required.
        self.m = math.ceil(capacity * math.log2(math.e) * math.log2(1 / error_rate))
        # k = (m/n) * ln 2: optimal number of hash functions.
        # BUG FIX: the original used math.log1p(2) (= ln 3 ~= 1.0986) here,
        # overestimating k by ~58%; the Bloom-filter optimum uses ln 2.
        self.k = math.ceil(math.log(2) * self.m / capacity)
        # Memory needed in MiB, and how many 512 MiB redis strings that takes.
        # The first character of the value selects the block, so at most 256 blocks.
        self.mem = math.ceil(self.m / 8 / 1024 / 1024)
        self.blocknum = math.ceil(self.mem / 512)
        self.seeds = self.SEEDS[0:self.k]
        self.key = key
        # Fold negative mmh3 results into [0, 2^31): bit offsets must be >= 0.
        self.N = 2 ** 31 - 1
        self.redis = conn
        print(self.mem)
        print(self.k)

    def add(self, value):
        """Set the k bits for *value* in its redis block."""
        name = self.key + "_" + str(ord(value[0]) % self.blocknum)
        hashs = self.get_hashs(value)
        for hash in hashs:
            self.redis.setbit(name, hash, 1)

    def is_exist(self, value):
        """Return True iff every one of *value*'s k bits is set (may false-positive)."""
        name = self.key + "_" + str(ord(value[0]) % self.blocknum)
        hashs = self.get_hashs(value)
        exist = True
        for hash in hashs:
            exist = exist & self.redis.getbit(name, hash)
        # getbit returns ints, so normalize the accumulated value to a bool.
        return bool(exist)

    def get_hashs(self, value):
        """Return the k non-negative bit offsets for *value*."""
        hashs = list()
        for seed in self.seeds:
            hash = mmh3.hash(value, seed)
            if hash >= 0:
                hashs.append(hash)
            else:
                hashs.append(self.N - hash)
        return hashs
# Module-level redis client for the Bloom filter.
# NOTE(review): host/port are hard-coded here; consider reading them from
# settings (REDIS_URL) so this module and Scrapy point at the same server.
pool = ConnectionPool(host='127.0.0.1', port=6379, db=0)
conn = StrictRedis(connection_pool=pool)
# Usage example / manual smoke test (kept commented out):
# start = time.time()
# bf = PyBloomFilter(conn=conn)
# bf.add('www.jobbole.com')
# bf.add('www.zhihu.com')
# print(bf.is_exist('www.zhihu.com'))
# print(bf.is_exist('www.lagou.com'))
# -*- coding: utf-8 -*-
# Scrapy settings for JobsSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'JobsSpider'
SPIDER_MODULES = ['JobsSpider.spiders']
NEWSPIDER_MODULE = 'JobsSpider.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'JobsSpider (+http://www.yourdomain.com)'
# Obey robots.txt rules — deliberately disabled for this crawl.
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'JobsSpider.middlewares.JobsspiderSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# Custom user-agent middleware replaces the built-in one (set to None below).
DOWNLOADER_MIDDLEWARES = {
'JobsSpider.middlewares.CustomUAMiddleware': 1,
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware':None
}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# MySQL writer runs first (priority 1), then scrapy_redis mirrors items
# into redis (priority 2).
ITEM_PIPELINES = {
'JobsSpider.pipelines.MySQLTwistedPipeline': 1,
'scrapy_redis.pipelines.RedisPipeline':2
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# MySQL connection parameters consumed by MySQLTwistedPipeline.from_settings().
# Database host
MYSQL_HOST = '192.168.10.220'
# Database user
MYSQL_USER = 'root'
# Database password — NOTE(review): credentials are hard-coded in source;
# move them to environment variables or an untracked local config.
MYSQL_PASSWD = '123456'
# Database port
MYSQL_PORT = 3306
# Database name
MYSQL_DBNAME = 'jobs'
# Connection charset
MYSQL_CHARSET = 'utf8'
# scrapy_redis: schedule requests through redis and store request
# fingerprints there, so deduplication is shared across spider instances.
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
# Redis connection URL (user 'root', empty password).
REDIS_URL = 'redis://root:@192.168.10.220:6379'