费劲吧啦地爬到了数据,在满心欢喜之前还有一关要过,那就是去重
是滴,一万条数据重复一千次就只有十条有效数据了
不要问我这个咋算的,我不会告诉你,总之这个时候就需要另一个英雄登场了
布隆!!!!
好吧,不是他,是另一个"布隆"——用布隆过滤器(Bloom Filter)来去重
https://github.com/liyaopinner/BloomFilter_imooc
下载布隆过滤器,将其中的py_bloomfilter.py放到scrapy_redis包中
是的,这里需要用到redis
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class JobsspiderPipeline(object):
    """Default no-op pipeline generated by Scrapy; items pass through untouched."""

    def process_item(self, item, spider):
        """Return *item* unchanged so any later pipeline can keep processing it."""
        return item
# Synchronous DB writes are slower than the crawler produces items, which can
# leave the last items unwritten when the crawl finishes — hence the
# asynchronous pipeline below.
#
# Recipe for the asynchronous MySQL pipeline:
#   1. import twisted.enterprise.adbapi and pymysql.cursors
#   2. put the database connection parameters in settings
#   3. implement from_settings(): read the parameters from settings, build a
#      connection pool from them, and return a pipeline instance that holds
#      the pool as an attribute
#   4. implement process_item(): hand the worker function and its arguments to
#      db_pool.runInteraction(func, args); the worker runs the SQL via cursor
#   5. take the result returned by runInteraction(), attach an error callback,
#      and print the failure reason inside it
import pymysql
from pymysql import cursors

# Twisted handles the asynchronous task processing; adbapi is its
# database-access module (thread-pool wrapper around DB-API drivers).
from twisted.enterprise import adbapi
class MySQLTwistedPipeline(object):
    """Write scraped job items to MySQL asynchronously via Twisted's adbapi pool."""

    # Scrapy calls from_settings automatically once the pipeline is activated,
    # passing the project settings so we can build the connection pool.
    @classmethod
    def from_settings(cls, settings):
        """Read MySQL parameters from *settings*, build a pool, return the pipeline."""
        params = {
            'host': settings['MYSQL_HOST'],
            'user': settings['MYSQL_USER'],
            'password': settings['MYSQL_PASSWD'],
            'port': settings['MYSQL_PORT'],
            'db': settings['MYSQL_DBNAME'],
            'charset': settings['MYSQL_CHARSET'],
            'use_unicode': True,
            # DictCursor: rows come back as dicts instead of tuples.
            'cursorclass': cursors.DictCursor,
        }
        # First argument names the DB-API module the pool should import;
        # the rest are forwarded to pymysql.connect().
        pool = adbapi.ConnectionPool('pymysql', **params)
        return cls(pool)

    def __init__(self, db_pool):
        # Connection pool created by from_settings().
        self.db_pool = db_pool

    def process_item(self, item, spider):
        """Queue an asynchronous INSERT for *item* and pass the item along."""
        # runInteraction(func, *args) runs func(cursor, *args) in a pool thread.
        deferred = self.db_pool.runInteraction(self.insert_item, item)
        # If the SQL raises, the errback below is invoked with the failure.
        deferred.addErrback(self.handle_error, item, spider)
        return item

    def handle_error(self, failure, item, spider):
        """Errback: report why the insert failed (*failure* holds the reason)."""
        print(failure)

    def insert_item(self, cursor, item):
        """Runs inside the pool's thread: execute the parameterized INSERT."""
        sql = "INSERT INTO jobs(job_name,org_name,job_location,max_money,min_money,date)VALUES (%s,%s,%s,%s,%s,%s)"
        values = (
            item['job_name'],
            item['org_name'],
            item['job_location'],
            item['max_money'],
            item['min_money'],
            item['date'],
        )
        cursor.execute(sql, values)
import mmh3
from redis import ConnectionPool,StrictRedis
import math
import time
class PyBloomFilter():
    """Redis-backed Bloom filter using mmh3 hashes over redis bitmaps.

    Bits are stored in redis string values named '<key>_<i>'; a value is
    routed to a block by the ordinal of its first character, so values are
    expected to start with an ASCII character (at most 256 blocks).
    """

    # 100 fixed seeds for the mmh3 hash family; only the first k are used.
    SEEDS = [543, 460, 171, 876, 796, 607, 650, 81, 837, 545, 591, 946, 846, 521, 913, 636, 878, 735, 414, 372,
             344, 324, 223, 180, 327, 891, 798, 933, 493, 293, 836, 10, 6, 544, 924, 849, 438, 41, 862, 648, 338,
             465, 562, 693, 979, 52, 763, 103, 387, 374, 349, 94, 384, 680, 574, 480, 307, 580, 71, 535, 300, 53,
             481, 519, 644, 219, 686, 236, 424, 326, 244, 212, 909, 202, 951, 56, 812, 901, 926, 250, 507, 739, 371,
             63, 584, 154, 7, 284, 617, 332, 472, 140, 605, 262, 355, 526, 647, 923, 199, 518]

    def __init__(self, capacity=1000000000, error_rate=0.00000001, conn=None, key='BloomFilter'):
        """Size the filter for *capacity* values at *error_rate* false positives.

        capacity   -- expected number of distinct values to deduplicate
        error_rate -- acceptable false-positive probability
        conn       -- redis client used for setbit/getbit (may be None for sizing only)
        key        -- prefix of the redis keys holding the bitmaps
        """
        # m = n * log2(e) * log2(1/p): total number of bits required.
        self.m = math.ceil(capacity * math.log2(math.e) * math.log2(1 / error_rate))
        # k = (m/n) * ln 2: optimal number of hash functions.
        # BUG FIX: the original used math.log1p(2) (= ln 3 ~= 1.0986) here,
        # overestimating k by ~58%; the Bloom-filter optimum uses ln 2.
        self.k = math.ceil(math.log(2) * self.m / capacity)
        # Memory needed in MiB, and how many 512 MiB redis strings that takes.
        # The first character of the value selects the block, so at most 256 blocks.
        self.mem = math.ceil(self.m / 8 / 1024 / 1024)
        self.blocknum = math.ceil(self.mem / 512)
        self.seeds = self.SEEDS[0:self.k]
        self.key = key
        # Fold negative mmh3 results into [0, 2^31): bit offsets must be >= 0.
        self.N = 2 ** 31 - 1
        self.redis = conn
        print(self.mem)
        print(self.k)

    def add(self, value):
        """Set the k bits for *value* in its redis block."""
        name = self.key + "_" + str(ord(value[0]) % self.blocknum)
        hashs = self.get_hashs(value)
        for hash in hashs:
            self.redis.setbit(name, hash, 1)

    def is_exist(self, value):
        """Return True iff every one of *value*'s k bits is set (may false-positive)."""
        name = self.key + "_" + str(ord(value[0]) % self.blocknum)
        hashs = self.get_hashs(value)
        exist = True
        for hash in hashs:
            exist = exist & self.redis.getbit(name, hash)
        # getbit returns ints, so normalize the accumulated value to a bool.
        return bool(exist)

    def get_hashs(self, value):
        """Return the k non-negative bit offsets for *value*."""
        hashs = list()
        for seed in self.seeds:
            hash = mmh3.hash(value, seed)
            if hash >= 0:
                hashs.append(hash)
            else:
                hashs.append(self.N - hash)
        return hashs
# Module-level redis client for the Bloom filter.
# NOTE(review): host/port are hard-coded here; consider reading them from
# settings (REDIS_URL) so this module and Scrapy point at the same server.
pool = ConnectionPool(host='127.0.0.1', port=6379, db=0)
conn = StrictRedis(connection_pool=pool)
# Usage example / manual smoke test (kept commented out):
# start = time.time()
# bf = PyBloomFilter(conn=conn)
# bf.add('www.jobbole.com')
# bf.add('www.zhihu.com')
# print(bf.is_exist('www.zhihu.com'))
# print(bf.is_exist('www.lagou.com'))
# -*- coding: utf-8 -*-
# Scrapy settings for JobsSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'JobsSpider'
SPIDER_MODULES = ['JobsSpider.spiders']
NEWSPIDER_MODULE = 'JobsSpider.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'JobsSpider (+http://www.yourdomain.com)'
# Obey robots.txt rules — deliberately disabled for this crawl.
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'JobsSpider.middlewares.JobsspiderSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# Custom user-agent middleware replaces the built-in one (set to None below).
DOWNLOADER_MIDDLEWARES = {
'JobsSpider.middlewares.CustomUAMiddleware': 1,
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware':None
}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# MySQL writer runs first (priority 1), then scrapy_redis mirrors items
# into redis (priority 2).
ITEM_PIPELINES = {
'JobsSpider.pipelines.MySQLTwistedPipeline': 1,
'scrapy_redis.pipelines.RedisPipeline':2
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# MySQL connection parameters consumed by MySQLTwistedPipeline.from_settings().
# Database host
MYSQL_HOST = '192.168.10.220'
# Database user
MYSQL_USER = 'root'
# Database password — NOTE(review): credentials are hard-coded in source;
# move them to environment variables or an untracked local config.
MYSQL_PASSWD = '123456'
# Database port
MYSQL_PORT = 3306
# Database name
MYSQL_DBNAME = 'jobs'
# Connection charset
MYSQL_CHARSET = 'utf8'
# scrapy_redis: schedule requests through redis and store request
# fingerprints there, so deduplication is shared across spider instances.
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
# Redis connection URL (user 'root', empty password).
REDIS_URL = 'redis://root:@192.168.10.220:6379'