首先安装 kafka-python 客户端库（注意安装的是 Python 客户端，而不是 Kafka 服务本身）:
pip install kafka-python
向topic写数据:
import time
from pyspark.sql import *
from pyspark.sql import SparkSession
from kafka import KafkaProducer
import json
# Build (or reuse) a SparkSession with Hive support enabled, so Hive
# tables can be read directly via spark.table(...).
spark = SparkSession \
    .builder \
    .appName("spark") \
    .enableHiveSupport() \
    .getOrCreate()
# SparkContext handle, exposed for any RDD-level operations.
sc = spark.sparkContext
# NOTE: the original `sqlContext = SQLContext(sc)` was removed — SQLContext
# is deprecated since Spark 2.0 (SparkSession supersedes it) and the
# variable was never used anywhere in this file.
'''
pyspark读取hive表为dataframe格式
将表中数据转为字典格式
向topic:test-topic中写数据
'''
producer = KafkaProducer(
value_serializer=lambda v: json.dumps(v).encode('utf-8'),
bootstrap_servers=['**********']
)
data = spark.table(*********)
for i in data.collect():
data_dict = {}
for j in range(len(data.columns)):
data_dict[data.columns[j]] = i[j]
producer.send('test-topic', data_dict)
producer.close()
从topic读数据:
from kafka import KafkaConsumer
import json
'''
Consume records from the Kafka topic "test-topic" and print each
JSON-decoded payload. Runs until interrupted.
'''
consumer = KafkaConsumer(
    'test-topic',
    # Each raw message value is parsed from JSON bytes into a Python object.
    value_deserializer=json.loads,
    bootstrap_servers=['*******************'],
)
for message in consumer:
    print(message.value)