scrapy-redis: Redis-to-Kafka producer and consumer

Redis-to-Kafka producer
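The producer below does a blocking pop on the `<spider_name>:items` lists in Redis, which is where scrapy-redis' RedisPipeline serializes crawled items. For context, a minimal settings sketch of that pipeline follows; the Redis host and port are illustrative and simply mirror the values used in the script:

# settings.py (sketch): scrapy-redis pushes serialized items to "<spider>:items"
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
}
REDIS_HOST = '172.16.30.45'  # illustrative; matches redis_host below
REDIS_PORT = 6379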

# -*- coding: utf-8 -*-
import time
import json
import random
import htmlmin
from lxml import etree
from datetime import datetime
from redis import StrictRedis
from kafka import KafkaProducer
from multiprocessing import Process

# redis
redis_host = '172.16.30.45'
client = StrictRedis(
    host=redis_host, port=6379, db=0
)

# Kafka producer
kafka_host_1 = '172.16.30.25'
kafka_host_2 = '172.16.30.45'
kafka_host_3 = '172.16.30.65'
producer = KafkaProducer(
    value_serializer=lambda v: json.dumps(v).encode('utf-8'),
    bootstrap_servers=['{}:9092'.format(kafka_host_1),'{}:9092'.format(kafka_host_2),'{}:9092'.format(kafka_host_3)]
)

spider_names = ['onion_xs6qb_market_spider','onion_zj4o7_market_spider','onion_c2p3h_market_spider']


def dec2bin(n=0, max_len=8):
    """
    Convert a decimal integer to its binary string, left-padded with zeros to max_len bits.
    :param n: value to convert
    :param max_len: target bit length of the resulting string
    :return:
    """
    tmp_bin = "{0:b}".format(n)
    if len(tmp_bin) < max_len:
        tmp_bin = "0"*(max_len-len(tmp_bin)) + tmp_bin
    return tmp_bin

def generate_data_id(producer_id=1, data_subtype=1, primary_id=4):
    """
    Generate a record's data_id: 2-byte producer_id + 2-byte data_subtype + 4-byte primary key.
    :param producer_id:
    :param data_subtype:
    :param primary_id:
    :return:
    """
    temp_str = dec2bin(producer_id, 16) + dec2bin(data_subtype, 16) + dec2bin(primary_id, 32)
    return int("0b"+temp_str, 2)
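
# Illustrative example: generate_data_id(1, 2, 1700000000) concatenates
# 16 + 16 + 32 bits into a single 64-bit integer:
#   0000000000000001 | 0000000000000010 | <32-bit binary of primary_id>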


def transfer_redis_kafka(spider_name):
    while True:
        time.sleep(0.01)
        source, redis_data = client.blpop(["{}:items".format(spider_name)])
        redis_data = json.loads(redis_data.decode("utf-8"))
        response = etree.HTML(redis_data['raw_text'])
        # string(.) returns the full text content of <body> as a single string
        contents = response.xpath("//html//body")[0].xpath("string(.)")
        kafka_data = {
            "data_id": generate_data_id(1,2, int(time.time())),
            "url": redis_data['url'],
            "content":"\n".join(contents),
            "crawl_time": redis_data['crawl_time'],
            "domain_name": redis_data['domain'],
            "title": redis_data['raw_title'],
            "html": htmlmin.minify(redis_data['raw_text'].encode('utf-8').decode('utf-8'), remove_all_empty_space=True),
            "language": redis_data['language'],
            "data_type":'webpage',
            "net_type": redis_data['net_type'],
            "h1": redis_data['h1'],
            "meta": redis_data['meta'],
            "headers": redis_data['headers'],
            "content_type": redis_data['content_type'],
            "content_encode": redis_data['content_encode'],
            "code": redis_data['code'],
            "links": redis_data['links'],
            "gmt_create": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S"),
            "gmt_modified": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S"),
        }
        print('web:',kafka_data)
        producer.send('demo.crawled_anwangzhongwen', kafka_data)
    # producer.close()

def task_schedule():
    # Run one transfer process per spider so the item queues drain in parallel
    processes = []
    for spider_name in spider_names:
        process = Process(target=transfer_redis_kafka, args=(spider_name,))
        processes.append(process)
        process.start()


if __name__ == '__main__':
    task_schedule()
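producer.send() in kafka-python is asynchronous; the call above only queues the record. If delivery needs to be confirmed, or buffers drained before shutdown, a minimal sketch of the same send with acknowledgement could look like this:

# Block until the broker acknowledges the record (kafka-python future API)
future = producer.send('demo.crawled_anwangzhongwen', kafka_data)
try:
    meta = future.get(timeout=10)  # raises on delivery failure
    print('delivered:', meta.topic, meta.partition, meta.offset)
except Exception as exc:
    print('kafka send failed:', exc)
# producer.flush() can also be called to drain any buffered records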

Kafka consumer

import json

from kafka import KafkaConsumer

kafka_host_1 = '172.16.30.25'
kafka_host_2 = '172.16.30.45'
kafka_host_3 = '172.16.30.65'
consumer = KafkaConsumer(
    'demo.crawled_anwangzhongwen',
    group_id="tor",
    bootstrap_servers=['{}:9092'.format(kafka_host_1), '{}:9092'.format(kafka_host_2), '{}:9092'.format(kafka_host_3)],
    auto_offset_reset='earliest',
    value_deserializer=json.loads,
)

for data_kafka in consumer:
    print(data_kafka.value)
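Each message yielded by the loop is a kafka-python ConsumerRecord, so topic, partition, and offset metadata are available alongside the deserialized value; with group_id set as above, offsets are auto-committed by default. A slightly expanded sketch of the same loop:

for data_kafka in consumer:
    # ConsumerRecord carries metadata alongside the deserialized payload
    print(data_kafka.topic, data_kafka.partition, data_kafka.offset)
    print(data_kafka.value)
consumer.close()  # only reached if the loop is interrupted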