# -*- coding: utf-8 -*-
# pipelinesKafka.py -- pushes every scraped item to a Kafka topic as a JSON message.

# Scrapy
# from scrapy.conf import settings
from scrapy.utils.project import get_project_settings
# PyKafka
# from pykafka import KafkaClient
from kafka import KafkaProducer
import json


class ScrapyKafkaPipeline(object):
    def __init__(self):
        # Read KAFKA_IP_PORT / KAFKA_TOPIC_NAME from the project settings.
        self.settings = get_project_settings()
        self.producer = KafkaProducer(bootstrap_servers=self.settings['KAFKA_IP_PORT'])

    def process_item(self, item, spider):
        # Serialize the item fields to pretty-printed JSON with sorted keys.
        data2 = json.dumps({
            'product_name': item['product_name'],
            'item': item['item'],
            'currency': item['currency'],
            'price': item['price'],
            'days7_total_sold': item['days7_total_sold'],
            'total_sold': item['total_sold'],
            'viewed': item['viewed'],
            'shipping_summary': item['shipping_summary'],
            'categroy_id': item['categroy_id'],
            'start_time': item['start_time'],
            'product_url': item['product_url'],
            'img_url': item['img_url'],
            'store_name': item['store_name'],
            'store_url': item['store_url'],
            'create_time': item['create_time'],
            'ship_country': item["ship_country"],
            'ship_area': item["ship_area"],
            'category_url': item["category_url"],
            'category_url_page': item["category_url_page"],
            'is_fire': item["is_fire"],
        }, sort_keys=True, indent=4, separators=(',', ': '))
        # Kafka expects bytes, so encode the JSON string before sending.
        self.producer.send(self.settings['KAFKA_TOPIC_NAME'], bytes(data2, encoding='utf-8'))
        return item

    def close_spider(self, spider):
        # Release the producer connection when the spider finishes.
        self.producer.close()
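# A minimal consumer sketch (not part of the project) for verifying that the
# pipeline's messages actually reach the topic. It assumes the same kafka-python
# package and the broker/topic values configured in settings.py below; the
# group_id is a hypothetical placeholder chosen for illustration.
import json

from kafka import KafkaConsumer

consumer = KafkaConsumer(
    'ebay_de_hot',                              # KAFKA_TOPIC_NAME from settings.py
    bootstrap_servers=["192.168.99.1:9092"],    # KAFKA_IP_PORT from settings.py
    group_id='ebay_de_hot_check',               # hypothetical consumer group
    auto_offset_reset='earliest',
    value_deserializer=lambda v: json.loads(v.decode('utf-8')),
)

for message in consumer:
    # Each value is the dict serialized by ScrapyKafkaPipeline.process_item().
    print(message.value['product_name'], message.value['price'])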
# -*- coding: utf-8 -*-

# Scrapy settings for ebay_product_de project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import random

BOT_NAME = 'ebay_product_de'

SPIDER_MODULES = ['ebay_product_de.spiders']
NEWSPIDER_MODULE = 'ebay_product_de.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ebay_product_de (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Follow redirects
REDIRECT_ENABLED = True

# redis
# Use the scrapy_redis scheduler so requests are distributed through Redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Use the scrapy_redis dupefilter so request deduplication happens in Redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Default queue: priority-ordered (Scrapy's default), backed by a sorted set, i.e. neither FIFO nor LIFO.
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'
# Keep the scrapy-redis queues in Redis (do not clear them), so a crawl can be paused and resumed.
SCHEDULER_PERSIST = True

# # Redis connection parameters (password-protected variant)
# REDIS_HOST = '192.168.99.2'
# REDIS_PORT = 6379
# REDIS_PARAMS = {"password": "r1", 'db': 13}

# Redis connection parameters
REDIS_HOST = '192.168.99.2'
REDIS_PORT = 6379
REDIS_PARAMS = {'db': 13}

# Log level
LOG_LEVEL = 'DEBUG'
# By default RFPDupeFilter logs only the first duplicate request; setting
# DUPEFILTER_DEBUG to True logs all duplicate requests.
DUPEFILTER_DEBUG = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Download delay (interacts with concurrency)
DOWNLOAD_DELAY = random.triangular(0.4, 0.6)
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'ebay_product_de.middlewares.EbayProductDeSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # 'ebay_product_de.middlewares.EbayProductDeDownloaderMiddleware': 543,
    'ebay_product_de.middlewares.RandomUserAgentMiddleware': 543,
}
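# DOWNLOADER_MIDDLEWARES above points at
# ebay_product_de.middlewares.RandomUserAgentMiddleware, which is not shown in
# this dump. A minimal sketch of what such a middleware could look like
# (hypothetical implementation; the USER_AGENT_LIST setting is an assumed name,
# and the real middlewares.py may differ):
#
#     import random
#
#     class RandomUserAgentMiddleware(object):
#         """Pick a random User-Agent header for every outgoing request."""
#
#         def __init__(self, user_agents):
#             self.user_agents = user_agents
#
#         @classmethod
#         def from_crawler(cls, crawler):
#             # USER_AGENT_LIST would be a list of UA strings in settings.py (hypothetical).
#             return cls(crawler.settings.getlist('USER_AGENT_LIST'))
#
#         def process_request(self, request, spider):
#             if self.user_agents:
#                 request.headers['User-Agent'] = random.choice(self.user_agents)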
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'ebay_product_de.pipelines.EbayProductDePipeline': 300,
    # 'ebay_product_de.pipelines.MysqlPipeline': 300,
    'ebay_product_de.pipelinesKafka.ScrapyKafkaPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Kafka configuration
# Kafka broker address and port (default localhost:9092)
KAFKA_IP_PORT = ["192.168.99.1:9092"]
# Kafka topic name
# Hot-selling listings
KAFKA_TOPIC_NAME = "ebay_de_hot"
# Default sort
# KAFKA_TOPIC_NAME = "ebay_us_default"

# MySQL connection settings
DB_HOST = '192.168.99.23'
DB_PORT = 3306
DB_USER = 'bd'
DB_PASSWORD = 'yree'
DB_NAME = 'data'
DB_CHARSET = 'utf8mb4'

# Close the spider when the Redis queue stays empty (idle) for too long
MYEXT_ENABLED = True  # enable the extension
IDLE_NUMBER = 360  # idle duration in 5-second units: 60 units = 5 minutes, so 360 = 30 minutes

# Activate the extension under EXTENSIONS
EXTENSIONS = {
    'ebay_product_de.extensions.RedisSpiderSmartIdleClosedExensions': 500,
}

HTTPERROR_ALLOWED_CODES = [400]
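# extensions.py is not included in this dump. A minimal sketch of what the
# RedisSpiderSmartIdleClosedExensions extension referenced above could look
# like, assuming the usual scrapy-redis idle-close pattern and the
# MYEXT_ENABLED / IDLE_NUMBER settings defined above; the real extensions.py
# may differ in its details.
from scrapy import signals
from scrapy.exceptions import NotConfigured


class RedisSpiderSmartIdleClosedExensions(object):
    def __init__(self, idle_number, crawler):
        self.crawler = crawler
        self.idle_number = idle_number
        self.idle_count = 0

    @classmethod
    def from_crawler(cls, crawler):
        # Only run when explicitly enabled in settings.py.
        if not crawler.settings.getbool('MYEXT_ENABLED'):
            raise NotConfigured
        idle_number = crawler.settings.getint('IDLE_NUMBER', 360)
        ext = cls(idle_number, crawler)
        crawler.signals.connect(ext.spider_idle, signal=signals.spider_idle)
        return ext

    def spider_idle(self, spider):
        # spider_idle fires roughly every 5 seconds while the Redis queue is
        # empty; close the spider once it has stayed idle for IDLE_NUMBER ticks.
        self.idle_count += 1
        if self.idle_count >= self.idle_number:
            self.crawler.engine.close_spider(spider, 'closed by idle-time extension')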